Source code and builds.

retoor 2025-02-12 14:50:44 +01:00
parent 61957e8311
commit 5038520e6d
6 changed files with 213 additions and 1 deletion

2
.gitignore vendored

@@ -1,4 +1,4 @@
downie*
downie.*
__pycache__/
www*
.venv

BIN
dist/downie-1.0.0-py3-none-any.whl vendored Normal file

Binary file not shown.

BIN
dist/downie-1.0.0.tar.gz vendored Normal file

Binary file not shown.

1
downie Symbolic link

@@ -0,0 +1 @@
.venv/bin/downie

187
src/downie/__init__.py Normal file

@@ -0,0 +1,187 @@
import aiohttp
from app.app import Application as BaseApplication
import asyncio
from bs4 import BeautifulSoup
import argparse
import pathlib
import logging
import aiofiles
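# BaseApplication (imported above from the external "app" package) is assumed to
# provide the database handle (self.db) and the find/insert/upsert helpers used below.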
logger = logging.getLogger("downie")
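# Downie recursively mirrors a single site: responses are cached in SQLite
# (downie.db) and each page is written to a local directory named after the
# site's host.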
class Downie(BaseApplication):
    def __init__(self, request_concurrency_limit=500, write_concurrency_limit=10, *args, **kwargs):
        self.base_url = None
        # Honor the constructor arguments instead of hardcoding the limits.
        self.request_concurrency_limit = request_concurrency_limit
        self.write_concurrency_limit = write_concurrency_limit
        self.semaphore_write = asyncio.Semaphore(self.write_concurrency_limit)
        self.semaphore_request = asyncio.Semaphore(self.request_concurrency_limit)
self.output_dir = pathlib.Path(".")
self.request_count = 0
self.redirect_limit = 5
self.links = set()
self._http_session = None
super().__init__(db_path="sqlite:///downie.db",*args, **kwargs)
        self.db.query("PRAGMA synchronous = 0")
        # "use_journal_mode" and "use_wal" are not real SQLite pragmas; WAL is
        # enabled through journal_mode.
        self.db.query("PRAGMA journal_mode = WAL")
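    # One shared aiohttp session, created lazily and closed in close().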
@property
def http_session(self):
if not self._http_session:
self._http_session = aiohttp.ClientSession()
return self._http_session
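    # A URL counts as registered once it has a row in the "crawl" table; the
    # base URL itself is always treated as unregistered.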
async def is_registered(self, url):
if url == self.base_url:
return False
return len(list(await self.find('crawl',dict(url=url)))) > 0
async def set_in_progress(self, url):
return await self.upsert('crawl',dict(url=url,status=1),['url'])
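    # Marks a URL as done (status=2) and mirrors its content to disk, stripping
    # the absolute base URL from quoted links so they become relative.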
async def set_done(self,url,content):
if not content:
return
await self.upsert('crawl',dict(url=url,status=2),['url'])
local_path = await self.get_local_path(url)
if not local_path:
return None
path = pathlib.Path(local_path)
try:
            if path.parent.name.endswith(".html"):
                # rstrip() strips a character set, not a suffix, and pathlib.Path
                # has joinpath(), not join_path().
                path = path.parent.parent.joinpath(path.parent.name[:-len(".html")]).joinpath(path.name)
path.parent.mkdir(exist_ok=True,parents=True)
except Exception as ex:
logger.exception(ex)
if not path.is_dir():
if not path.exists():
logger.debug(f"Writing new file: {local_path}.")
async with self.semaphore_write:
try:
content = content.replace(b'"' + self.base_url.encode(),b'"')
content = content.replace(b"'" + self.base_url.encode(),b"'")
async with aiofiles.open(path, 'wb+') as file:
await file.write(content)
except Exception as ex:
logger.exception(ex)
else:
logger.debug(f"Write cancelled, file already exists: {local_path}.")
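    # Maps a remote URL to a path below the output directory; paths without a
    # file extension get an ".html" suffix.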
async def get_local_path(self,url):
if url == self.base_url:
return None
local_dir = str(self.output_dir)
url = url.replace(self.base_url,"")
if url.startswith("/"):
url = url.lstrip("/")
#url_parts = url.split("/")
#for x in range(0,len(url_parts)- 1):
#url_parts[x] = 'd_' + url_parts[x]
#url = "/".join(url_parts)
        if "." not in url.split("/")[-1]:
url = url + ".html"
return local_dir + "/" + url
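    # Normalizes an href to an absolute URL on the base domain; javascript:,
    # mailto: and external links yield None.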
async def joinurl(self, url):
if url.startswith("javascript:"):
return None
if url.startswith("mailto"):
return None
if url.startswith("http"):
if url.startswith(self.base_url):
return url
return None
if url.startswith("/"):
return self.base_url + url
return self.base_url + "/" + url
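    # Registers, fetches and stores one URL, then crawls every unseen link on
    # the page concurrently. URLs containing 'search' or '#' are skipped.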
async def crawl(self, url):
urls = set()
if 'search' in url:
return False
if '#' in url:
return False
logger.debug("Current url: " + url)
await self.register_url(url)
content = await self.http_get(url)
await self.set_done(url, content)
async for link in self.get_hrefs(content):
            if link not in self.links:
                self.links.add(link)
                urls.add(link)
        tasks = []
        # Use a separate loop variable so the `url` parameter is not shadowed.
        for child_url in urls:
            tasks.append(self.crawl(url=child_url))
        await asyncio.gather(*tasks)
async def close(self):
if self._http_session:
await self._http_session.close()
self._http_session = None
async def register_url(self, url):
if not (await self.is_registered(url)):
await self.insert("crawl",dict(url=url,status=0))
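    # Yields absolute, in-domain links extracted from the page's <a href> tags.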
async def get_hrefs(self, content):
if not content:
return
try:
soup = BeautifulSoup(content, 'html.parser')
links = [a['href'] for a in soup.find_all('a', href=True)]
for link in links:
if link:
link = await self.joinurl(link)
if link:
yield link
except Exception as ex:
logger.exception(ex)
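    # Cache-first fetch: cached responses come from the "content" table; new
    # requests follow up to redirect_limit redirects and are written back to
    # the cache.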
async def http_get(self, url):
self.request_count += 1
cached = await self.find('content',dict(url=url))
        if cached:
            logger.debug(f"Request #{self.request_count} hit the cache: {url}.")
            return cached[0].get('data')
        else:
            logger.debug(f"Request #{self.request_count} is to a new URL: {url}.")
async with self.semaphore_request:
try:
for x in range(self.redirect_limit):
response = await self.http_session.get(url,allow_redirects=False)
if response.status in (301, 302, 303, 307, 308):
url = await self.joinurl(response.headers.get("Location"))
if not url:
return None
continue
response.raise_for_status()
content = await response.read()
await self.upsert('content',dict(url=url,data=content),['url'])
logger.debug(f"Request #{self.request_count} is written to cache: {url}.")
return content
except Exception as ex:
logger.exception(ex)
return None
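    # Entry point: derives the output directory from the host part of the URL
    # and starts crawling at the base URL.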
    async def run_async(self, url):
        # Only trailing slashes need to be stripped from the base URL.
        self.base_url = url.rstrip("/")
        if not self.base_url.startswith("http"):
            raise ValueError("Base URL should start with http:// or https://.")
self.output_dir = self.base_url[self.base_url.find("//") + 2:]
try:
await self.crawl(self.base_url)
finally:
await self.close()
def run(self,url):
asyncio.run(self.run_async(url=url))

24
src/downie/__main__.py Normal file

@@ -0,0 +1,24 @@
import argparse
from downie import Downie
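# Command-line entry point: downie <url> [-c CONCURRENCY]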
def main():
argparser = argparse.ArgumentParser()
argparser.add_argument(
"url",
type=str
)
argparser.add_argument(
"-c",
help="Concurrency",
default=10,
type=int
)
args = argparser.parse_args()
    # Assumed intent: the -c flag maps to the request concurrency limit;
    # previously it was parsed but never used.
    downie = Downie(request_concurrency_limit=args.c)
    downie.run(url=args.url)
if __name__ == '__main__':
main()