commit 6ad826f3ee32aa8c7c2834a759cdf516d046e20c
Author: retoor
Date:   Wed Feb 12 14:40:14 2025 +0100

    Initial commit.

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..08a196e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+downie*
+__pycache__/
+www*
+.venv
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..8d5fa6b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,18 @@
+PYTHON=.venv/bin/python
+PIP=.venv/bin/pip
+DOWNIE=.venv/bin/downie
+
+
+install:
+	sudo apt install python3 python3-venv python3-pip -y
+	python3 -m venv .venv
+	${PIP} install -e .
+	@ln -s ${DOWNIE} downie || true
+	@echo "Installed downie. Use by executing ./downie [url to download]"
+
+build:
+	${PIP} install build
+	${PYTHON} -m build .
+
+example:
+	${DOWNIE} https://www.molodetz.nl
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..bc0814f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,65 @@
+# Downie
+
+**Downie** is a full **site downloader**. I tried several downloaders and none of them could handle big sites like **molodetz.nl** (containing **300,000+** links / resources). See **Features** for what makes this site downloader **better than most of the competition**.
+
+If you encounter a site that can't be downloaded using this downloader, contact me at *retoor@molodetz.nl*. Alternatively, you can create a bug ticket after logging in. **I respond to both within 24h**.
+
+## Features
+ - Lightning-fast **async** fetching of pages with concurrency.
+ - Fetching is limited to the given domain. **Remote URLs are not followed**, and neither are redirects to remote URLs (yes, that can be an issue).
+ - Distinguishes files from folders by adding `.html` to pages without an extension, so `/pony` becomes `/pony.html` and `/pony/photos.html` remains possible.
+ - **Absolute URLs are converted to relative URLs**, so the site becomes **portable** and **works in isolation** on your webserver.
+ - **Progress is saved.** You do not have to start over after exiting the application; it simply continues where it left off.
+ - Caching of already visited URLs.
+
+## Using Downie
+After following the **installation instructions**, use `downie https://target-site.com` if **globally installed**. If not globally installed, execute `./downie https://target-site.com` from the current directory.
+
+## Installation
+
+### Debian (for development)
+The only **requirement** is `make`. Install it by executing `sudo apt install make -y` in the terminal.
+Installation of all required Python dependencies is done by executing `make install`.
+This will **install system packages** using `apt`:
+ - `python3`
+ - `python3-venv`
+ - `python3-pip`
+ *There is a good chance you already have them.*
+ After that:
+ 1. A Python virtual environment named `.venv` will be created. *You can ignore this folder.*
+ 2. Required Python libraries will be installed. *(aiohttp, aiofiles, beautifulsoup4, app (molodetz)).*
+ 3. Downie will be installed. *(For the devs: in editable mode with `-e`!)*
+ 4. An executable file named `downie` will be placed in the current directory. This is the main application.
+**Tip:** make downie globally accessible by copying `downie` to `/usr/local/bin`.
+
+### Other operating systems (Windows / Mac; not for development)
+You have to figure out how to install python3, python3-venv and python3-pip on your OS. Make sure these are on your system; chances are you already have them.
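+
+The sketch below shows how these constructor parameters could be passed when driving the crawler from Python rather than via the CLI. This is a minimal, hypothetical sketch: the import path, the base URL argument, and the `run()` coroutine are assumptions for illustration; only `concurrent_request_count` and `concurrent_write_count` are documented above.
+
+```python
+# Hypothetical programmatic usage -- a minimal sketch, not the documented CLI.
+# Assumed for illustration: Downie is importable from the downie package,
+# takes the base URL as its first argument, and exposes an async run() method.
+import asyncio
+
+from downie import Downie  # assumed import path
+
+
+async def main():
+    downloader = Downie(
+        "https://www.molodetz.nl",      # assumed base URL argument
+        concurrent_request_count=200,   # documented default is 500
+        concurrent_write_count=5,       # documented default is 10
+    )
+    await downloader.run()              # assumed entry point
+
+
+asyncio.run(main())
+```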
+Run the following commands:
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+pip install git+https://retoor.molodetz.nl/retoor/downie.git
+cp .venv/bin/downie .
+```
+**Tip:** make downie globally accessible by copying `downie` to `/usr/local/bin`.
+
+## Default configuration / internals
+These settings are not exposed as CLI parameters, but they are configurable as `Downie` class constructor parameters:
+ - `concurrent_request_count` - Number of concurrent requests. Defaults to 500.
+ - `concurrent_write_count` - Number of concurrent file downloads. Defaults to 10.
+These values are based on my own usage; more optimal values may exist. See the sketch below.
+
+## Running a downloaded site
+
+### Open the site statically
+Just open a page (`.html` file) from the site in your browser.
+
+### Serve the site
+Simply run:
+```bash
+python3 -m http.server [port]
+```
+If you don't know which port to use, use `7331`. Your website is then accessible on `http://localhost:7331`.
+
+## Caution
+URLs in the downloaded content are modified to make the site portable, so the downloaded site is not necessarily a literal copy of the content as it is online; it is optimized to work offline. A literal copy of the site could be unusable.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..95cfb9d
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,34 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "downie"
+version = "1.0.0"
+description = "A fast async site downloader with concurrency control. Written for a specific use case: crawling a particular site. Only downloads URLs under the base URL; remote links are not followed."
+readme = "README.md"
+requires-python = ">=3.7"
+dependencies = [
+    "aiohttp",
+    "beautifulsoup4",
+    "aiofiles",
+    "app @ git+https://retoor.molodetz.nl/retoor/app.git"
+]
+authors = [
+    { name = "retoor", email = "retoor@molodetz.nl" }
+]
+
+[project.scripts]
+downie = "downie.__main__:main"
+
+[project.optional-dependencies]
+autocomplete = [
+    "argcomplete>=1.12.3"
+]
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+
+[tool.setuptools.packages.find]
+where = ["src"]
+