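"""Asynchronous, exhaustive devRant crawler.

A producer/consumer pipeline on asyncio queues: one producer pages through the
recent-rants feed, rant consumers fetch full rants and their comments, and
user consumers fetch profiles and queue any rants discovered on them.

Minimal usage sketch (assumes, without checking, that ``Api`` and
``DatabaseManager`` can be constructed with no arguments):

    crawler = DevRantCrawler(
        api=Api(), db=DatabaseManager(), rant_consumers=4, user_consumers=2
    )
    asyncio.run(crawler.run())
"""
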
import asyncio
import logging
from typing import Set

from database import DatabaseManager
from devranta.api import Api, Rant


class DevRantCrawler:
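    """Coordinates one feed producer, rant consumers, and user consumers
    over shared asyncio queues, tracking progress in ``self.stats``."""
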
    def __init__(
        self, api: Api, db: DatabaseManager, rant_consumers: int, user_consumers: int
    ):
        self.api = api
        self.db = db
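        # Work queues and the shutdown flag; maxsize is high enough to be
        # effectively unbounded for this crawl.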
        self.rant_queue = asyncio.Queue(maxsize=1000000)
        self.user_queue = asyncio.Queue(maxsize=1000000)
        self.shutdown_event = asyncio.Event()

        self.num_rant_consumers = rant_consumers
        self.num_user_consumers = user_consumers

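        # In-memory dedup caches so nothing is queued twice within a single
        # run; the database existence checks guard against repeats across runs.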
        self.seen_rant_ids: Set[int] = set()
        self.seen_user_ids: Set[int] = set()
        self.stats = {
            "rants_processed": 0,
            "rants_added_to_db": 0,
            "comments_added_to_db": 0,
            "users_processed": 0,
            "users_added_to_db": 0,
            "api_errors": 0,
            "producer_loops": 0,
            "end_of_feed_hits": 0,
            "rants_queued": 0,
            "users_queued": 0,
        }

    async def _queue_user_if_new(self, user_id: int):
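        """Queue a user for profile fetching unless already seen or stored."""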
        if user_id in self.seen_user_ids:
            return

        self.seen_user_ids.add(user_id)
        if not await self.db.user_exists(user_id):
            await self.user_queue.put(user_id)
            self.stats["users_queued"] += 1

    async def _queue_rant_if_new(self, rant_obj: Rant):
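        """Persist a rant and queue it for comment crawling unless known."""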
        rant_id = rant_obj["id"]
        if rant_id in self.seen_rant_ids:
            return

        self.seen_rant_ids.add(rant_id)
        if not await self.db.rant_exists(rant_id):
            await self.db.add_rant(rant_obj)
            self.stats["rants_added_to_db"] += 1
            await self.rant_queue.put(rant_id)
            self.stats["rants_queued"] += 1

    async def _initial_seed(self):
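        """Re-ignite crawling by queueing a random sample of known users."""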
        logging.info("Starting initial seeder to re-ignite the crawling process...")
        user_ids = await self.db.get_random_user_ids(limit=2000)
        if not user_ids:
            logging.info(
                "Seeder found no existing users. Crawler will start from scratch."
            )
            return

        queued = 0
        for user_id in user_ids:
            if user_id not in self.seen_user_ids:
                self.seen_user_ids.add(user_id)
                await self.user_queue.put(user_id)
                self.stats["users_queued"] += 1
                queued += 1
        logging.info(
            f"Seeder finished: queued {queued} users to kickstart exploration."
        )

    async def _rant_producer(self):
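        """Page through the 'recent' feed, handing every rant to the dedup
        gate, and reset to the top after repeated empty responses."""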
        logging.info("Rant producer started.")
        skip = 0
        consecutive_empty_responses = 0

        while not self.shutdown_event.is_set():
            try:
                logging.info(f"Producer: Fetching rants with skip={skip}...")
                rants = await self.api.get_rants(sort="recent", limit=50, skip=skip)
                self.stats["producer_loops"] += 1

                if not rants:
                    consecutive_empty_responses += 1
                    logging.info(
                        f"Producer: Feed returned empty. Consecutive empty hits: {consecutive_empty_responses}."
                    )
                    if consecutive_empty_responses >= 5:
                        self.stats["end_of_feed_hits"] += 1
                        logging.info(
                            "Producer: End of feed likely reached. Pausing for 15 minutes before reset."
                        )
                        await asyncio.sleep(900)
                        skip = 0
                        consecutive_empty_responses = 0
                    else:
                        await asyncio.sleep(10)
                    continue

                consecutive_empty_responses = 0
                rants_in_batch = 0
                for rant in rants:
                    await self._queue_rant_if_new(rant)
                    rants_in_batch += 1

                logging.info(
                    f"Producer: Processed {rants_in_batch} rants from feed. Total queued: {self.stats['rants_queued']}."
                )
                skip += len(rants)
                await asyncio.sleep(2)

            except Exception as e:
                logging.critical(
                    f"Producer: Unhandled exception: {e}. Retrying in 60s."
                )
                self.stats["api_errors"] += 1
                await asyncio.sleep(60)

    async def _rant_consumer(self, worker_id: int):
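        """Fetch full rant details, store comments, and queue their authors."""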
        logging.info(f"Rant consumer #{worker_id} started.")
        while not self.shutdown_event.is_set():
            try:
                rant_id = await self.rant_queue.get()
                logging.info(
                    f"Rant consumer #{worker_id}: Processing rant ID {rant_id}."
                )

                rant_details = await self.api.get_rant(rant_id)
                if not rant_details or not rant_details.get("success"):
                    logging.warning(
                        f"Rant consumer #{worker_id}: Failed to fetch details for rant {rant_id}."
                    )
                    self.rant_queue.task_done()
                    continue

                await self._queue_user_if_new(rant_details["rant"]["user_id"])

                comments = rant_details.get("comments", [])
                for comment in comments:
                    await self.db.add_comment(comment)
                    self.stats["comments_added_to_db"] += 1
                    await self._queue_user_if_new(comment["user_id"])

                logging.info(
                    f"Rant consumer #{worker_id}: Finished processing rant {rant_id}, found {len(comments)} comments."
                )
                self.stats["rants_processed"] += 1
                self.rant_queue.task_done()

            except Exception as e:
                logging.error(f"Rant consumer #{worker_id}: Unhandled exception: {e}")
                # Still mark the item done so queue.join() can finish on shutdown.
                self.rant_queue.task_done()

    async def _user_consumer(self, worker_id: int):
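        """Fetch a user's profile, store it, and queue rants found on it."""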
        logging.info(f"User consumer #{worker_id} started.")
        while not self.shutdown_event.is_set():
            try:
                user_id = await self.user_queue.get()
                logging.info(
                    f"User consumer #{worker_id}: Processing user ID {user_id}."
                )

                profile = await self.api.get_profile(user_id)
                if not profile:
                    logging.warning(
                        f"User consumer #{worker_id}: Could not fetch profile for user {user_id}."
                    )
                    self.user_queue.task_done()
                    continue

                await self.db.add_user(profile, user_id)
                self.stats["users_added_to_db"] += 1

                rants_found_on_profile = 0
                content_sections = profile.get("content", {}).get("content", {})
                for section_name in ["rants", "upvoted", "favorites"]:
                    for rant_obj in content_sections.get(section_name, []):
                        await self._queue_rant_if_new(rant_obj)
                        rants_found_on_profile += 1

                logging.info(
                    f"User consumer #{worker_id}: Finished user {user_id}, found {rants_found_on_profile} associated rants."
                )
                self.stats["users_processed"] += 1
                self.user_queue.task_done()
            except Exception as e:
                logging.error(f"User consumer #{worker_id}: Unhandled exception: {e}")
                # Still mark the item done so queue.join() can finish on shutdown.
                self.user_queue.task_done()

    async def _stats_reporter(self):
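        """Log a one-line progress summary every 15 seconds."""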
        logging.info("Stats reporter started.")
        while not self.shutdown_event.is_set():
            await asyncio.sleep(15)
            logging.info(
                f"[STATS] Rants Q'd/Proc: {self.stats['rants_queued']}/{self.stats['rants_processed']} | "
                f"Users Q'd/Proc: {self.stats['users_queued']}/{self.stats['users_processed']} | "
                f"Comments DB: {self.stats['comments_added_to_db']} | "
                f"Queues (R/U): {self.rant_queue.qsize()}/{self.user_queue.qsize()} | "
                f"API Errors: {self.stats['api_errors']}"
            )

    async def run(self):
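        """Seed the queues, then run producer and consumers until cancelled."""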
        logging.info("Exhaustive crawler starting...")
        await self._initial_seed()

        logging.info("Starting main producer and consumer tasks...")
        tasks = []
        try:
            tasks.append(asyncio.create_task(self._rant_producer()))
            tasks.append(asyncio.create_task(self._stats_reporter()))

            for i in range(self.num_rant_consumers):
                tasks.append(asyncio.create_task(self._rant_consumer(i + 1)))

            for i in range(self.num_user_consumers):
                tasks.append(asyncio.create_task(self._user_consumer(i + 1)))

            await asyncio.gather(*tasks, return_exceptions=True)
        except asyncio.CancelledError:
            logging.info("Crawler run cancelled.")
        finally:
            await self.shutdown()

    async def shutdown(self):
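        """Signal shutdown, drain the queues briefly, then cancel all tasks."""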
        if self.shutdown_event.is_set():
            return
        logging.info("Shutting down... sending signal to all tasks.")
        self.shutdown_event.set()

        logging.info("Waiting for queues to empty... Press Ctrl+C again to force exit.")
        try:
            await asyncio.wait_for(self.rant_queue.join(), timeout=30)
            await asyncio.wait_for(self.user_queue.join(), timeout=30)
        except (asyncio.TimeoutError, asyncio.CancelledError):
            logging.warning("Could not empty queues in time, proceeding with shutdown.")

        # Cancel everything still running on the loop except this coroutine.
        tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
        for task in tasks:
            task.cancel()

        await asyncio.gather(*tasks, return_exceptions=True)
        logging.info("All tasks cancelled.")
        logging.info(f"--- FINAL STATS ---\n{self.stats}")