Source code and builds.
This commit is contained in:
parent
61957e8311
commit
5038520e6d

.gitignore (vendored, 2 changed lines)
@@ -1,4 +1,4 @@
-downie*
+downie.*
 __pycache__/
 www*
 .venv

dist/downie-1.0.0-py3-none-any.whl (vendored, new file, BIN)
Binary file not shown.

dist/downie-1.0.0.tar.gz (vendored, new file, BIN)
Binary file not shown.
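
The two dist/ artifacts are the built wheel and source distribution for downie 1.0.0. As a minimal sketch (not part of the commit, assuming the wheel sits at the path shown above), the archive contents can be listed with the standard library, since a wheel is an ordinary zip file:

import zipfile

# Hypothetical check of the built wheel before publishing; the path below is
# the one added in this commit.
with zipfile.ZipFile("dist/downie-1.0.0-py3-none-any.whl") as wheel:
    for name in wheel.namelist():
        print(name)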

src/downie/__init__.py (new file, 187 lines)
@@ -0,0 +1,187 @@
import aiohttp
from app.app import Application as BaseApplication
import asyncio
from bs4 import BeautifulSoup
import argparse
import pathlib
import logging
import aiofiles

logger = logging.getLogger("downie")


class Downie(BaseApplication):

    def __init__(self, request_concurrency_limit=500, write_concurrency_limit=10, *args, **kwargs):
        self.base_url = None
        self.request_concurrency_limit = request_concurrency_limit
        self.write_concurrency_limit = write_concurrency_limit
        self.semaphore_write = asyncio.Semaphore(self.write_concurrency_limit)
        self.semaphore_request = asyncio.Semaphore(self.request_concurrency_limit)
        self.output_dir = pathlib.Path(".")
        self.request_count = 0
        self.redirect_limit = 5
        self.links = set()
        self._http_session = None
        super().__init__(db_path="sqlite:///downie.db", *args, **kwargs)
        # Trade durability for speed and enable write-ahead logging.
        self.db.query("PRAGMA synchronous = 0")
        self.db.query("PRAGMA journal_mode = WAL")

    @property
    def http_session(self):
        if not self._http_session:
            self._http_session = aiohttp.ClientSession()
        return self._http_session

    async def is_registered(self, url):
        if url == self.base_url:
            return False
        return len(list(await self.find('crawl', dict(url=url)))) > 0

    async def set_in_progress(self, url):
        return await self.upsert('crawl', dict(url=url, status=1), ['url'])

    async def set_done(self, url, content):
        if not content:
            return
        await self.upsert('crawl', dict(url=url, status=2), ['url'])
        local_path = await self.get_local_path(url)
        if not local_path:
            return None
        path = pathlib.Path(local_path)
        try:
            if path.parent.name.endswith(".html"):
                # A parent segment saved earlier as an ".html" file would collide
                # with the directory needed here, so drop the suffix.
                parent_name = path.parent.name[:-len(".html")]
                path = path.parent.parent.joinpath(parent_name).joinpath(path.name)
            path.parent.mkdir(exist_ok=True, parents=True)
        except Exception as ex:
            logger.exception(ex)
        if not path.is_dir():
            if not path.exists():
                logger.debug(f"Writing new file: {local_path}.")
                async with self.semaphore_write:
                    try:
                        # Rewrite absolute links to the base url as relative links.
                        content = content.replace(b'"' + self.base_url.encode(), b'"')
                        content = content.replace(b"'" + self.base_url.encode(), b"'")
                        async with aiofiles.open(path, 'wb+') as file:
                            await file.write(content)
                    except Exception as ex:
                        logger.exception(ex)
            else:
                logger.debug(f"Write cancelled, file already exists: {local_path}.")

    async def get_local_path(self, url):
        if url == self.base_url:
            return None
        local_dir = str(self.output_dir)
        url = url.replace(self.base_url, "")
        if url.startswith("/"):
            url = url.lstrip("/")
        #url_parts = url.split("/")
        #for x in range(0,len(url_parts)- 1):
        #url_parts[x] = 'd_' + url_parts[x]
        #url = "/".join(url_parts)
        if "." not in url.split("/")[-1]:
            url = url + ".html"
        return local_dir + "/" + url

    async def joinurl(self, url):
        if url.startswith("javascript:"):
            return None
        if url.startswith("mailto"):
            return None
        if url.startswith("http"):
            if url.startswith(self.base_url):
                return url
            return None
        if url.startswith("/"):
            return self.base_url + url
        return self.base_url + "/" + url

    async def crawl(self, url):
        urls = set()
        if 'search' in url:
            return False
        if '#' in url:
            return False
        logger.debug("Current url: " + url)
        await self.register_url(url)
        content = await self.http_get(url)
        await self.set_done(url, content)
        async for link in self.get_hrefs(content):
            if link not in self.links:
                self.links.add(link)
                urls.add(link)
        # Recurse into every newly discovered link concurrently.
        tasks = []
        for url in urls:
            tasks.append(self.crawl(url=url))
        await asyncio.gather(*tasks)

    async def close(self):
        if self._http_session:
            await self._http_session.close()
            self._http_session = None

    async def register_url(self, url):
        if not (await self.is_registered(url)):
            await self.insert("crawl", dict(url=url, status=0))

    async def get_hrefs(self, content):
        if not content:
            return
        try:
            soup = BeautifulSoup(content, 'html.parser')
            links = [a['href'] for a in soup.find_all('a', href=True)]
            for link in links:
                if link:
                    link = await self.joinurl(link)
                    if link:
                        yield link
        except Exception as ex:
            logger.exception(ex)

    async def http_get(self, url):
        self.request_count += 1
        cached = await self.find('content', dict(url=url))
        if cached:
            logger.debug(f"Request #{self.request_count} hit the cache: {url}.")
            return cached[0].get('data')
        else:
            logger.debug(f"Request #{self.request_count} is to a new url: {url}.")

        async with self.semaphore_request:
            try:
                # Follow redirects manually so off-site locations can be dropped.
                for x in range(self.redirect_limit):
                    response = await self.http_session.get(url, allow_redirects=False)
                    if response.status in (301, 302, 303, 307, 308):
                        url = await self.joinurl(response.headers.get("Location"))
                        if not url:
                            return None
                        continue

                    response.raise_for_status()
                    content = await response.read()
                    await self.upsert('content', dict(url=url, data=content), ['url'])
                    logger.debug(f"Request #{self.request_count} is written to cache: {url}.")
                    return content
            except Exception as ex:
                logger.exception(ex)
                return None

    async def run_async(self, url):
        self.base_url = url.strip("/")
        if not self.base_url.startswith("http"):
            raise ValueError("Base url should start with http.")
        self.output_dir = self.base_url[self.base_url.find("//") + 2:]
        try:
            await self.crawl(self.base_url)
        finally:
            await self.close()

    def run(self, url):
        asyncio.run(self.run_async(url=url))

src/downie/__main__.py (new file, 24 lines)
@@ -0,0 +1,24 @@
import argparse

from downie import Downie


def main():
    argparser = argparse.ArgumentParser()

    argparser.add_argument(
        "url",
        type=str
    )
    argparser.add_argument(
        "-c",
        help="Concurrency",
        default=10,
        type=int
    )

    args = argparser.parse_args()
    downie = Downie()
    downie.run(url=args.url)


if __name__ == '__main__':
    main()
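
For reference, a minimal sketch of driving the crawler from Python instead of the command line; it only uses the Downie.run entry point added above, and the start url is a placeholder:

from downie import Downie

# Placeholder target; replace with a site you are allowed to mirror.
START_URL = "https://example.com"

# run() wraps asyncio.run(run_async(...)), so the call blocks until the crawl
# finishes and the aiohttp session is closed.
crawler = Downie()
crawler.run(url=START_URL)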