"""Asynchronous devRant crawler.

A producer/consumer pipeline: a producer walks the public "recent" feed while
consumer workers fetch full rants (with comments) and user profiles, feeding
newly discovered IDs back into the queues.
"""

import asyncio
import logging
from typing import Set

from database import DatabaseManager
from devranta.api import Api, Rant


class DevRantCrawler:
    def __init__(
        self, api: Api, db: DatabaseManager, rant_consumers: int, user_consumers: int
    ):
        self.api = api
        self.db = db
        self.rant_queue = asyncio.Queue(maxsize=1000000)
        self.user_queue = asyncio.Queue(maxsize=1000000)
        self.shutdown_event = asyncio.Event()
        self.num_rant_consumers = rant_consumers
        self.num_user_consumers = user_consumers
        # In-memory dedup sets so the same ID is never queued twice in one run.
        self.seen_rant_ids: Set[int] = set()
        self.seen_user_ids: Set[int] = set()
        self.stats = {
            "rants_processed": 0,
            "rants_added_to_db": 0,
            "comments_added_to_db": 0,
            "users_processed": 0,
            "users_added_to_db": 0,
            "api_errors": 0,
            "producer_loops": 0,
            "end_of_feed_hits": 0,
            "rants_queued": 0,
            "users_queued": 0,
        }

    async def _queue_user_if_new(self, user_id: int):
        if user_id in self.seen_user_ids:
            return
        self.seen_user_ids.add(user_id)
        if not await self.db.user_exists(user_id):
            await self.user_queue.put(user_id)
            self.stats["users_queued"] += 1

    async def _queue_rant_if_new(self, rant_obj: Rant):
        rant_id = rant_obj["id"]
        if rant_id in self.seen_rant_ids:
            return
        self.seen_rant_ids.add(rant_id)
        if not await self.db.rant_exists(rant_id):
            await self.db.add_rant(rant_obj)
            self.stats["rants_added_to_db"] += 1
            await self.rant_queue.put(rant_id)
            self.stats["rants_queued"] += 1

    async def _initial_seed(self):
        logging.info("Starting initial seeder to re-ignite crawling process...")
        user_ids = await self.db.get_random_user_ids(limit=2000)
        if not user_ids:
            logging.info(
                "Seeder found no existing users. Crawler will start from scratch."
            )
            return
        for user_id in user_ids:
            if user_id not in self.seen_user_ids:
                self.seen_user_ids.add(user_id)
                await self.user_queue.put(user_id)
                self.stats["users_queued"] += 1
        logging.info(
            f"Seeder finished: Queued {len(user_ids)} users to kickstart exploration."
        )

    async def _rant_producer(self):
        logging.info("Rant producer started.")
        skip = 0
        consecutive_empty_responses = 0
        while not self.shutdown_event.is_set():
            try:
                logging.info(f"Producer: Fetching rants with skip={skip}...")
                rants = await self.api.get_rants(sort="recent", limit=50, skip=skip)
                self.stats["producer_loops"] += 1
                if not rants:
                    consecutive_empty_responses += 1
                    logging.info(
                        f"Producer: Feed returned empty. Consecutive empty hits: {consecutive_empty_responses}."
                    )
                    if consecutive_empty_responses >= 5:
                        self.stats["end_of_feed_hits"] += 1
                        logging.info(
                            "Producer: End of feed likely reached. Pausing for 15 minutes before reset."
                        )
                        await asyncio.sleep(900)
                        skip = 0
                        consecutive_empty_responses = 0
                    else:
                        await asyncio.sleep(10)
                    continue
                consecutive_empty_responses = 0
                for rant in rants:
                    await self._queue_rant_if_new(rant)
                logging.info(
                    f"Producer: Processed {len(rants)} rants from feed. Total queued: {self.stats['rants_queued']}."
                )
                skip += len(rants)
                await asyncio.sleep(2)
            except Exception as e:
                logging.critical(
                    f"Producer: Unhandled exception: {e}. Retrying in 60s."
                )
                self.stats["api_errors"] += 1
                await asyncio.sleep(60)

    async def _rant_consumer(self, worker_id: int):
        logging.info(f"Rant consumer #{worker_id} started.")
        while not self.shutdown_event.is_set():
            rant_id = await self.rant_queue.get()
            try:
                logging.info(
                    f"Rant consumer #{worker_id}: Processing rant ID {rant_id}."
                )
                rant_details = await self.api.get_rant(rant_id)
                if not rant_details or not rant_details.get("success"):
                    logging.warning(
                        f"Rant consumer #{worker_id}: Failed to fetch details for rant {rant_id}."
                    )
                    continue
                await self._queue_user_if_new(rant_details["rant"]["user_id"])
                comments = rant_details.get("comments", [])
                for comment in comments:
                    await self.db.add_comment(comment)
                    self.stats["comments_added_to_db"] += 1
                    await self._queue_user_if_new(comment["user_id"])
                logging.info(
                    f"Rant consumer #{worker_id}: Finished processing rant {rant_id}, found {len(comments)} comments."
                )
                self.stats["rants_processed"] += 1
            except Exception as e:
                logging.error(f"Rant consumer #{worker_id}: Unhandled exception: {e}")
            finally:
                # Mark the item done exactly once per get(), on every code path,
                # so queue.join() in shutdown() can complete.
                self.rant_queue.task_done()

    async def _user_consumer(self, worker_id: int):
        logging.info(f"User consumer #{worker_id} started.")
        while not self.shutdown_event.is_set():
            user_id = await self.user_queue.get()
            try:
                logging.info(
                    f"User consumer #{worker_id}: Processing user ID {user_id}."
                )
                profile = await self.api.get_profile(user_id)
                if not profile:
                    logging.warning(
                        f"User consumer #{worker_id}: Could not fetch profile for user {user_id}."
                    )
                    continue
                await self.db.add_user(profile, user_id)
                self.stats["users_added_to_db"] += 1
                rants_found_on_profile = 0
                content_sections = profile.get("content", {}).get("content", {})
                for section_name in ["rants", "upvoted", "favorites"]:
                    for rant_obj in content_sections.get(section_name, []):
                        await self._queue_rant_if_new(rant_obj)
                        rants_found_on_profile += 1
                logging.info(
                    f"User consumer #{worker_id}: Finished user {user_id}, found and queued {rants_found_on_profile} associated rants."
                )
                self.stats["users_processed"] += 1
            except Exception as e:
                logging.error(f"User consumer #{worker_id}: Unhandled exception: {e}")
            finally:
                self.user_queue.task_done()

    async def _stats_reporter(self):
        logging.info("Stats reporter started.")
        while not self.shutdown_event.is_set():
            await asyncio.sleep(15)
            logging.info(
                f"[STATS] Rants Q'd/Proc: {self.stats['rants_queued']}/{self.stats['rants_processed']} | "
                f"Users Q'd/Proc: {self.stats['users_queued']}/{self.stats['users_processed']} | "
                f"Comments DB: {self.stats['comments_added_to_db']} | "
                f"Queues (R/U): {self.rant_queue.qsize()}/{self.user_queue.qsize()} | "
                f"API Errors: {self.stats['api_errors']}"
            )

    async def run(self):
        logging.info("Exhaustive crawler starting...")
        await self._initial_seed()
        logging.info("Starting main producer and consumer tasks...")
        tasks = []
        try:
            tasks.append(asyncio.create_task(self._rant_producer()))
            tasks.append(asyncio.create_task(self._stats_reporter()))
            for i in range(self.num_rant_consumers):
                tasks.append(asyncio.create_task(self._rant_consumer(i + 1)))
            for i in range(self.num_user_consumers):
                tasks.append(asyncio.create_task(self._user_consumer(i + 1)))
            await asyncio.gather(*tasks, return_exceptions=True)
        except asyncio.CancelledError:
            logging.info("Crawler run cancelled.")
        finally:
            await self.shutdown()

    async def shutdown(self):
        if self.shutdown_event.is_set():
            return
        logging.info("Shutting down... sending signal to all tasks.")
        self.shutdown_event.set()
        logging.info("Waiting for queues to empty... Press Ctrl+C again to force exit.")
Press Ctrl+C again to force exit.") try: await asyncio.wait_for(self.rant_queue.join(), timeout=30) await asyncio.wait_for(self.user_queue.join(), timeout=30) except (asyncio.TimeoutError, asyncio.CancelledError): logging.warning("Could not empty queues in time, proceeding with shutdown.") tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()] for task in tasks: task.cancel() await asyncio.gather(*tasks, return_exceptions=True) logging.info("All tasks cancelled.") logging.info(f"--- FINAL STATS ---\n{self.stats}")