Source code and builds.
commit 5038520e6d, parent 61957e8311
.gitignore (vendored), 2 changes
@@ -1,4 +1,4 @@
-downie*
+downie.*
 __pycache__/
 www*
 .venv
BIN dist/downie-1.0.0-py3-none-any.whl (vendored, new file)
Binary file not shown.
BIN dist/downie-1.0.0.tar.gz (vendored, new file)
Binary file not shown.
187 additions: src/downie/__init__.py (new file)
@@ -0,0 +1,187 @@
import asyncio
import logging
import pathlib

import aiofiles
import aiohttp
from bs4 import BeautifulSoup

from app.app import Application as BaseApplication

logger = logging.getLogger("downie")


class Downie(BaseApplication):

    def __init__(self, request_concurrency_limit=500, write_concurrency_limit=10, *args, **kwargs):
        self.base_url = None
        # Honour the constructor arguments instead of hard-coding the limits.
        self.request_concurrency_limit = request_concurrency_limit
        self.write_concurrency_limit = write_concurrency_limit
        self.semaphore_write = asyncio.Semaphore(self.write_concurrency_limit)
        self.semaphore_request = asyncio.Semaphore(self.request_concurrency_limit)
        self.output_dir = pathlib.Path(".")
        self.request_count = 0
        self.redirect_limit = 5
        self.links = set()
        self._http_session = None
        super().__init__(db_path="sqlite:///downie.db", *args, **kwargs)
        # Trade durability for speed: no fsync, write-ahead logging.
        self.db.query("PRAGMA synchronous = 0")
        self.db.query("PRAGMA journal_mode = WAL")

    @property
    def http_session(self):
        # Lazily create a single shared aiohttp session.
        if not self._http_session:
            self._http_session = aiohttp.ClientSession()
        return self._http_session

    async def is_registered(self, url):
        if url == self.base_url:
            return False
        return len(list(await self.find('crawl', dict(url=url)))) > 0

    async def set_in_progress(self, url):
        return await self.upsert('crawl', dict(url=url, status=1), ['url'])

    async def set_done(self, url, content):
        if not content:
            return
        await self.upsert('crawl', dict(url=url, status=2), ['url'])
        local_path = await self.get_local_path(url)
        if not local_path:
            return None
        path = pathlib.Path(local_path)
        try:
            if path.parent.name.endswith(".html"):
                # The parent was previously mapped to "<name>.html"; turn it into a
                # directory and nest this file inside it.
                path = path.parent.parent.joinpath(path.parent.name.removesuffix(".html"), path.name)
            path.parent.mkdir(exist_ok=True, parents=True)
        except Exception as ex:
            logger.exception(ex)
        if not path.is_dir():
            if not path.exists():
                logger.debug(f"Writing new file: {local_path}.")
                async with self.semaphore_write:
                    try:
                        # Rewrite absolute links to the base url as relative links.
                        content = content.replace(b'"' + self.base_url.encode(), b'"')
                        content = content.replace(b"'" + self.base_url.encode(), b"'")
                        async with aiofiles.open(path, 'wb+') as file:
                            await file.write(content)
                    except Exception as ex:
                        logger.exception(ex)
            else:
                logger.debug(f"Write cancelled, file already exists: {local_path}.")

    async def get_local_path(self, url):
        if url == self.base_url:
            return None
        local_dir = str(self.output_dir)
        url = url.replace(self.base_url, "")
        if url.startswith("/"):
            url = url.lstrip("/")
        # url_parts = url.split("/")
        # for x in range(0, len(url_parts) - 1):
        #     url_parts[x] = 'd_' + url_parts[x]
        # url = "/".join(url_parts)
        if "." not in url.split("/")[-1]:
            # Paths without a file extension are saved as "<path>.html".
            url = url + ".html"
        return local_dir + "/" + url

    async def joinurl(self, url):
        # Resolve a link against the base url; links that leave the site
        # (or are not http) are skipped by returning None.
        if url.startswith("javascript:"):
            return None
        if url.startswith("mailto"):
            return None
        if url.startswith("http"):
            if url.startswith(self.base_url):
                return url
            return None
        if url.startswith("/"):
            return self.base_url + url
        return self.base_url + "/" + url

    async def crawl(self, url):
        urls = set()
        if 'search' in url:
            return False
        if '#' in url:
            return False
        logger.debug("Current url: " + url)
        await self.register_url(url)
        content = await self.http_get(url)
        await self.set_done(url, content)
        async for link in self.get_hrefs(content):
            if link not in self.links:
                self.links.add(link)
                urls.add(link)
        tasks = []
        for url in urls:
            tasks.append(self.crawl(url=url))
        await asyncio.gather(*tasks)

    async def close(self):
        if self._http_session:
            await self._http_session.close()
            self._http_session = None

    async def register_url(self, url):
        if not (await self.is_registered(url)):
            await self.insert("crawl", dict(url=url, status=0))

    async def get_hrefs(self, content):
        if not content:
            return
        try:
            soup = BeautifulSoup(content, 'html.parser')
            links = [a['href'] for a in soup.find_all('a', href=True)]
            for link in links:
                if link:
                    link = await self.joinurl(link)
                    if link:
                        yield link
        except Exception as ex:
            logger.exception(ex)

    async def http_get(self, url):
        self.request_count += 1
        # Materialise the cache lookup so it can be tested and indexed.
        cached = list(await self.find('content', dict(url=url)))
        if cached:
            logger.debug(f"Request #{self.request_count} hit the cache: {url}.")
            return cached[0].get('data')
        else:
            logger.debug(f"Request #{self.request_count} is to a new url: {url}.")

        async with self.semaphore_request:
            try:
                # Follow redirects manually so every hop stays within the base url.
                for x in range(self.redirect_limit):
                    response = await self.http_session.get(url, allow_redirects=False)
                    if response.status in (301, 302, 303, 307, 308):
                        url = await self.joinurl(response.headers.get("Location"))
                        if not url:
                            return None
                        continue

                    response.raise_for_status()
                    content = await response.read()
                    await self.upsert('content', dict(url=url, data=content), ['url'])
                    logger.debug(f"Request #{self.request_count} is written to cache: {url}.")
                    return content
            except Exception as ex:
                logger.exception(ex)
                return None

    async def run_async(self, url):
        self.base_url = url.strip("/")
        if not self.base_url.startswith("http"):
            raise ValueError("Base url should start with http(s).")
        # Mirror the site into a directory named after its host and path.
        self.output_dir = pathlib.Path(self.base_url[self.base_url.find("//") + 2:])
        try:
            await self.crawl(self.base_url)
        finally:
            await self.close()

    def run(self, url):
        asyncio.run(self.run_async(url=url))

24 additions: src/downie/__main__.py (new file)
@@ -0,0 +1,24 @@
import argparse

from downie import Downie


def main():
    argparser = argparse.ArgumentParser()

    argparser.add_argument(
        "url",
        type=str
    )
    argparser.add_argument(
        "-c",
        help="Concurrency",
        default=10,
        type=int
    )

    args = argparser.parse_args()
    # The -c flag caps how many requests may run at once.
    downie = Downie(request_concurrency_limit=args.c)
    downie.run(url=args.url)


if __name__ == '__main__':
    main()
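
A minimal usage sketch (not part of the commit) showing how the crawler could be driven programmatically instead of through the CLI above; the target url and concurrency value are placeholders:

# Hypothetical example: mirror a site into ./example.com using the Downie class from this commit.
from downie import Downie

downie = Downie(request_concurrency_limit=20)
downie.run(url="https://example.com")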