diff --git a/.gitignore b/.gitignore
index 08a196e..c8b0182 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
-downie*
+downie.*
 __pycache__/
 www*
 .venv
diff --git a/dist/downie-1.0.0-py3-none-any.whl b/dist/downie-1.0.0-py3-none-any.whl
new file mode 100644
index 0000000..e98dbb2
Binary files /dev/null and b/dist/downie-1.0.0-py3-none-any.whl differ
diff --git a/dist/downie-1.0.0.tar.gz b/dist/downie-1.0.0.tar.gz
new file mode 100644
index 0000000..ab55ea5
Binary files /dev/null and b/dist/downie-1.0.0.tar.gz differ
diff --git a/downie b/downie
new file mode 120000
index 0000000..070c65d
--- /dev/null
+++ b/downie
@@ -0,0 +1 @@
+.venv/bin/downie
\ No newline at end of file
diff --git a/src/downie/__init__.py b/src/downie/__init__.py
new file mode 100644
index 0000000..ee0cfa8
--- /dev/null
+++ b/src/downie/__init__.py
@@ -0,0 +1,180 @@
+import asyncio
+import logging
+import pathlib
+
+import aiofiles
+import aiohttp
+from bs4 import BeautifulSoup
+
+from app.app import Application as BaseApplication
+
+logger = logging.getLogger("downie")
+
+
+class Downie(BaseApplication):
+
+    def __init__(self, request_concurrency_limit=500, write_concurrency_limit=10, *args, **kwargs):
+        self.base_url = None
+        self.request_concurrency_limit = request_concurrency_limit
+        self.write_concurrency_limit = write_concurrency_limit
+        self.semaphore_write = asyncio.Semaphore(self.write_concurrency_limit)
+        self.semaphore_request = asyncio.Semaphore(self.request_concurrency_limit)
+        self.output_dir = pathlib.Path(".")
+        self.request_count = 0
+        self.redirect_limit = 5
+        self.links = set()
+        self._http_session = None
+        super().__init__(db_path="sqlite:///downie.db", *args, **kwargs)
+        self.db.query("PRAGMA synchronous = OFF")
+        self.db.query("PRAGMA journal_mode = WAL")
+
+    @property
+    def http_session(self):
+        # Create the session lazily so it is bound to the running event loop.
+        if not self._http_session:
+            self._http_session = aiohttp.ClientSession()
+        return self._http_session
+
+    async def is_registered(self, url):
+        if url == self.base_url:
+            return False
+        return len(list(await self.find('crawl', dict(url=url)))) > 0
+
+    async def set_in_progress(self, url):
+        return await self.upsert('crawl', dict(url=url, status=1), ['url'])
+
+    async def set_done(self, url, content):
+        if not content:
+            return
+        await self.upsert('crawl', dict(url=url, status=2), ['url'])
+        local_path = await self.get_local_path(url)
+        if not local_path:
+            return None
+        path = pathlib.Path(local_path)
+        try:
+            # A parent directory named "*.html" collides with the page of the
+            # same name, so strip the suffix from the directory part.
+            if path.parent.name.endswith(".html"):
+                path = path.parent.parent.joinpath(path.parent.name.removesuffix(".html"), path.name)
+            path.parent.mkdir(exist_ok=True, parents=True)
+        except Exception as ex:
+            logger.exception(ex)
+        if not path.is_dir():
+            if not path.exists():
+                logger.debug(f"Writing new file: {local_path}.")
+                async with self.semaphore_write:
+                    try:
+                        # Rewrite absolute links to the base url as root-relative links.
+                        content = content.replace(b'"' + self.base_url.encode(), b'"')
+                        content = content.replace(b"'" + self.base_url.encode(), b"'")
+                        async with aiofiles.open(path, 'wb+') as file:
+                            await file.write(content)
+                    except Exception as ex:
+                        logger.exception(ex)
+            else:
+                logger.debug(f"Write cancelled, file already exists: {local_path}.")
+
+    async def get_local_path(self, url):
+        if url == self.base_url:
+            return None
+        local_dir = str(self.output_dir)
+        url = url.replace(self.base_url, "").lstrip("/")
+        # Pages without a file extension are saved as "<name>.html".
+        if "." not in url.split("/")[-1]:
+            url = url + ".html"
+        return local_dir + "/" + url
+
+    async def joinurl(self, url):
+        if url.startswith("javascript:"):
+            return None
+        if url.startswith("mailto:"):
+            return None
+        if url.startswith("http"):
+            # Only keep absolute links that stay on the crawled site.
+            if url.startswith(self.base_url):
+                return url
+            return None
+        if url.startswith("/"):
+            return self.base_url + url
+        return self.base_url + "/" + url
+
+    async def crawl(self, url):
+        urls = set()
+        if 'search' in url:
+            return
+        if '#' in url:
+            return
+        logger.debug("Current url: " + url)
+        await self.register_url(url)
+        content = await self.http_get(url)
+        await self.set_done(url, content)
+        async for link in self.get_hrefs(content):
+            if link not in self.links:
+                self.links.add(link)
+                urls.add(link)
+        tasks = [self.crawl(url=link) for link in urls]
+        await asyncio.gather(*tasks)
+
+    async def close(self):
+        if self._http_session:
+            await self._http_session.close()
+            self._http_session = None
+
+    async def register_url(self, url):
+        if not await self.is_registered(url):
+            await self.insert("crawl", dict(url=url, status=0))
+
+    async def get_hrefs(self, content):
+        if not content:
+            return
+        try:
+            soup = BeautifulSoup(content, 'html.parser')
+            links = [a['href'] for a in soup.find_all('a', href=True)]
+        except Exception as ex:
+            logger.exception(ex)
+            return
+        for link in links:
+            if link:
+                link = await self.joinurl(link)
+                if link:
+                    yield link
+
+    async def http_get(self, url):
+        self.request_count += 1
+        cached = await self.find('content', dict(url=url))
+        if cached:
+            logger.debug(f"Request #{self.request_count} hit the cache: {url}.")
+            return cached[0].get('data')
+        logger.debug(f"Request #{self.request_count} is to a new url: {url}.")
+        async with self.semaphore_request:
+            try:
+                for _ in range(self.redirect_limit):
+                    response = await self.http_session.get(url, allow_redirects=False)
+                    if response.status in (301, 302, 303, 307, 308):
+                        location = response.headers.get("Location")
+                        url = await self.joinurl(location) if location else None
+                        if not url:
+                            return None
+                        continue
+                    response.raise_for_status()
+                    content = await response.read()
+                    await self.upsert('content', dict(url=url, data=content), ['url'])
+                    logger.debug(f"Request #{self.request_count} is written to cache: {url}.")
+                    return content
+            except Exception as ex:
+                logger.exception(ex)
+        return None
+
+    async def run_async(self, url):
+        self.base_url = url.rstrip("/")
+        if not self.base_url.startswith("http"):
+            raise ValueError("Base url should start with http(s).")
+        # Mirror the site into a directory named after its host.
+        self.output_dir = pathlib.Path(self.base_url[self.base_url.find("//") + 2:])
+        try:
+            await self.crawl(self.base_url)
+        finally:
+            await self.close()
+
+    def run(self, url):
+        asyncio.run(self.run_async(url=url))
diff --git a/src/downie/__main__.py b/src/downie/__main__.py
new file mode 100644
index 0000000..dd7cf13
--- /dev/null
+++ b/src/downie/__main__.py
@@ -0,0 +1,26 @@
+import argparse
+
+from downie import Downie
+
+
+def main():
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument(
+        "url",
+        type=str,
+        help="Base url of the site to download"
+    )
+    argparser.add_argument(
+        "-c",
+        help="Request concurrency limit",
+        default=10,
+        type=int
+    )
+
+    args = argparser.parse_args()
+    downie = Downie(request_concurrency_limit=args.c)
+    downie.run(url=args.url)
+
+
+if __name__ == '__main__':
+    main()
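For reference, a minimal sketch of driving the crawler from Python rather than through the __main__ entry point, assuming the package above is importable; the url and the concurrency values are placeholders:

    import logging

    from downie import Downie

    # The crawler reports progress through the "downie" logger at DEBUG level.
    logging.basicConfig(level=logging.DEBUG)

    # Mirrors https://example.com into ./example.com/, caching fetched pages in
    # downie.db so an interrupted run can be repeated without re-downloading.
    downie = Downie(request_concurrency_limit=50, write_concurrency_limit=5)
    downie.run(url="https://example.com")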