Source code and builds.
commit 5038520e6d, parent 61957e8311
.gitignore (vendored), 2 changes
@@ -1,4 +1,4 @@
-downie*
+downie.*
 __pycache__/
 www*
 .venv
BIN dist/downie-1.0.0-py3-none-any.whl (vendored, new file)
Binary file not shown.
BIN dist/downie-1.0.0.tar.gz (vendored, new file)
Binary file not shown.
187 additions: src/downie/__init__.py (new file)
@@ -0,0 +1,187 @@
import asyncio
import logging
import pathlib

import aiofiles
import aiohttp
from bs4 import BeautifulSoup

from app.app import Application as BaseApplication

logger = logging.getLogger("downie")


class Downie(BaseApplication):

    def __init__(self, request_concurrency_limit=500, write_concurrency_limit=10, *args, **kwargs):
        self.base_url = None
        # Honour the constructor arguments instead of hard-coding the limits.
        self.request_concurrency_limit = request_concurrency_limit
        self.write_concurrency_limit = write_concurrency_limit
        self.semaphore_write = asyncio.Semaphore(self.write_concurrency_limit)
        self.semaphore_request = asyncio.Semaphore(self.request_concurrency_limit)
        self.output_dir = pathlib.Path(".")
        self.request_count = 0
        self.redirect_limit = 5
        self.links = set()
        self._http_session = None
        super().__init__(db_path="sqlite:///downie.db", *args, **kwargs)
        # Trade durability for speed: no fsync, write-ahead logging.
        self.db.query("PRAGMA synchronous = 0")
        self.db.query("PRAGMA journal_mode = WAL")

    @property
    def http_session(self):
        # Lazily create a single shared aiohttp session.
        if not self._http_session:
            self._http_session = aiohttp.ClientSession()
        return self._http_session

    async def is_registered(self, url):
        if url == self.base_url:
            return False
        return len(list(await self.find('crawl', dict(url=url)))) > 0

    async def set_in_progress(self, url):
        return await self.upsert('crawl', dict(url=url, status=1), ['url'])

    async def set_done(self, url, content):
        if not content:
            return
        await self.upsert('crawl', dict(url=url, status=2), ['url'])
        local_path = await self.get_local_path(url)
        if not local_path:
            return None
        path = pathlib.Path(local_path)
        try:
            if path.parent.name.endswith(".html"):
                # The parent was previously mapped to "<name>.html"; turn it into a
                # directory and nest this file inside it.
                path = path.parent.parent.joinpath(path.parent.name.removesuffix(".html"), path.name)
            path.parent.mkdir(exist_ok=True, parents=True)
        except Exception as ex:
            logger.exception(ex)
        if not path.is_dir():
            if not path.exists():
                logger.debug(f"Writing new file: {local_path}.")
                async with self.semaphore_write:
                    try:
                        # Rewrite absolute links to the base url as relative links.
                        content = content.replace(b'"' + self.base_url.encode(), b'"')
                        content = content.replace(b"'" + self.base_url.encode(), b"'")
                        async with aiofiles.open(path, 'wb+') as file:
                            await file.write(content)
                    except Exception as ex:
                        logger.exception(ex)
            else:
                logger.debug(f"Write cancelled, file already exists: {local_path}.")

    async def get_local_path(self, url):
        if url == self.base_url:
            return None
        local_dir = str(self.output_dir)
        url = url.replace(self.base_url, "")
        if url.startswith("/"):
            url = url.lstrip("/")
        # url_parts = url.split("/")
        # for x in range(0, len(url_parts) - 1):
        #     url_parts[x] = 'd_' + url_parts[x]
        # url = "/".join(url_parts)
        if "." not in url.split("/")[-1]:
            # Paths without a file extension are saved as "<path>.html".
            url = url + ".html"
        return local_dir + "/" + url

    async def joinurl(self, url):
        # Resolve a link against the base url; links that leave the site
        # (or are not http) are skipped by returning None.
        if url.startswith("javascript:"):
            return None
        if url.startswith("mailto"):
            return None
        if url.startswith("http"):
            if url.startswith(self.base_url):
                return url
            return None
        if url.startswith("/"):
            return self.base_url + url
        return self.base_url + "/" + url

    async def crawl(self, url):
        urls = set()
        if 'search' in url:
            return False
        if '#' in url:
            return False
        logger.debug("Current url: " + url)
        await self.register_url(url)
        content = await self.http_get(url)
        await self.set_done(url, content)
        async for link in self.get_hrefs(content):
            if link not in self.links:
                self.links.add(link)
                urls.add(link)
        tasks = []
        for url in urls:
            tasks.append(self.crawl(url=url))
        await asyncio.gather(*tasks)

    async def close(self):
        if self._http_session:
            await self._http_session.close()
            self._http_session = None

    async def register_url(self, url):
        if not (await self.is_registered(url)):
            await self.insert("crawl", dict(url=url, status=0))

    async def get_hrefs(self, content):
        if not content:
            return
        try:
            soup = BeautifulSoup(content, 'html.parser')
            links = [a['href'] for a in soup.find_all('a', href=True)]
            for link in links:
                if link:
                    link = await self.joinurl(link)
                    if link:
                        yield link
        except Exception as ex:
            logger.exception(ex)

    async def http_get(self, url):
        self.request_count += 1
        # Materialise the cache lookup so it can be tested and indexed.
        cached = list(await self.find('content', dict(url=url)))
        if cached:
            logger.debug(f"Request #{self.request_count} hit the cache: {url}.")
            return cached[0].get('data')
        else:
            logger.debug(f"Request #{self.request_count} is to a new url: {url}.")

        async with self.semaphore_request:
            try:
                # Follow redirects manually so every hop stays within the base url.
                for x in range(self.redirect_limit):
                    response = await self.http_session.get(url, allow_redirects=False)
                    if response.status in (301, 302, 303, 307, 308):
                        url = await self.joinurl(response.headers.get("Location"))
                        if not url:
                            return None
                        continue

                    response.raise_for_status()
                    content = await response.read()
                    await self.upsert('content', dict(url=url, data=content), ['url'])
                    logger.debug(f"Request #{self.request_count} is written to cache: {url}.")
                    return content
            except Exception as ex:
                logger.exception(ex)
                return None

    async def run_async(self, url):
        self.base_url = url.strip("/")
        if not self.base_url.startswith("http"):
            raise ValueError("Base url should start with http(s).")
        # Mirror the site into a directory named after its host and path.
        self.output_dir = pathlib.Path(self.base_url[self.base_url.find("//") + 2:])
        try:
            await self.crawl(self.base_url)
        finally:
            await self.close()

    def run(self, url):
        asyncio.run(self.run_async(url=url))

24 additions: src/downie/__main__.py (new file)
@@ -0,0 +1,24 @@
import argparse

from downie import Downie


def main():
    argparser = argparse.ArgumentParser()

    argparser.add_argument(
        "url",
        type=str
    )
    argparser.add_argument(
        "-c",
        help="Concurrency",
        default=10,
        type=int
    )

    args = argparser.parse_args()
    # The -c flag caps how many requests may run at once.
    downie = Downie(request_concurrency_limit=args.c)
    downie.run(url=args.url)


if __name__ == '__main__':
    main()
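
A minimal usage sketch (not part of the commit) showing how the crawler could be driven programmatically instead of through the CLI above; the target url and concurrency value are placeholders:

# Hypothetical example: mirror a site into ./example.com using the Downie class from this commit.
from downie import Downie

downie = Downie(request_concurrency_limit=20)
downie.run(url="https://example.com")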