# retoor

import asyncio
import json
import logging
from typing import Any, Dict, List, Optional

import dataset
from devranta.api import Comment, Rant, UserProfile


class DatabaseManager:
    """Async, batching front end to a SQLite database accessed through ``dataset``.

    Rants, comments and users are queued in memory and written with per-row
    upserts keyed on ``id``. A queue is flushed when it reaches ``batch_size``
    and all queues are flushed every ``flush_interval`` seconds by a
    background task. Blocking database calls run in worker threads via
    ``asyncio.to_thread``.

    Use as an async context manager::

        async with DatabaseManager("crawl.db") as db:
            await db.add_rant(rant)
    """

    # (index name, table, column) for every secondary index this class maintains.
    _INDEXES = (
        ("idx_rants_user_id", "rants", "user_id"),
        ("idx_rants_created_time", "rants", "created_time"),
        ("idx_comments_rant_id", "comments", "rant_id"),
        ("idx_comments_user_id", "comments", "user_id"),
        ("idx_users_username", "users", "username"),
    )

    def __init__(self, db_path: str, batch_size: int = 100, flush_interval: float = 5.0):
        """Store configuration; no connection is opened until ``__aenter__``.

        Args:
            db_path: Filesystem path of the SQLite database file.
            batch_size: Queue length at which a queue is flushed immediately.
            flush_interval: Seconds between background flushes of all queues.
        """
        self.db_path = db_path
        self.batch_size = batch_size
        self.flush_interval = flush_interval
        self._db: Optional[dataset.Database] = None
        self._rant_batch: List[Dict[str, Any]] = []
        self._comment_batch: List[Dict[str, Any]] = []
        self._user_batch: List[Dict[str, Any]] = []
        self._flush_task: Optional[asyncio.Task] = None
        # Serialises every queue mutation and flush.
        self._lock = asyncio.Lock()

    async def __aenter__(self):
        """Connect, create the indexes that can be created, start the flusher."""
        logging.info(f"Connecting to database at {self.db_path}...")
        self._db = dataset.connect(
            f"sqlite:///{self.db_path}?check_same_thread=False",
            engine_kwargs={"connect_args": {"check_same_thread": False}},
        )
        await self._create_indexes()
        self._flush_task = asyncio.create_task(self._periodic_flush())
        logging.info("Database connection successful.")
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Stop the flusher, write what is still queued, and close the database.

        The connection is closed even when the final flush raises; the
        exception still propagates to the caller.
        """
        if self._flush_task:
            self._flush_task.cancel()
            try:
                await self._flush_task
            except asyncio.CancelledError:
                pass
        try:
            await self.flush_all()
            # Tables are created lazily by the first upsert, so on a brand-new
            # database the indexes could not be created in __aenter__.
            await self._create_indexes()
        finally:
            if self._db:
                self._db.close()
            logging.info("Database connection closed.")

    async def _create_indexes(self):
        """Create each secondary index whose table and column already exist.

        A raw ``CREATE INDEX`` on a table that has not been created yet fails
        with "no such table", so those indexes are skipped here and retried
        on a later call.
        """

        def _sync_create() -> int:
            skipped = 0
            for index_name, table_name, column in self._INDEXES:
                if table_name not in self._db or not self._db[table_name].has_column(column):
                    skipped += 1
                    continue
                self._db.query(
                    f"CREATE INDEX IF NOT EXISTS {index_name} ON {table_name}({column})"
                )
            return skipped

        skipped = await asyncio.to_thread(_sync_create)
        if skipped:
            logging.debug(
                "Skipped %d indexes whose table or column does not exist yet.", skipped
            )
        logging.info("Database indexes verified.")

    async def _periodic_flush(self):
        """Flush all queues every ``flush_interval`` seconds until cancelled."""
        while True:
            await asyncio.sleep(self.flush_interval)
            try:
                await self.flush_all()
            except Exception:
                # flush_all already logged the details. Keep the loop alive so
                # one failed write does not stop all future periodic flushes.
                logging.warning(
                    "Periodic flush failed; unwritten rows stay queued for the next attempt."
                )

    async def flush_all(self):
        """Flush the rant, comment and user queues while holding the lock.

        All three flushes are attempted even if an earlier one fails.

        Raises:
            Exception: The first error raised by any of the three flushes.
        """
        async with self._lock:
            first_error: Optional[Exception] = None
            for flush in (self._flush_rants, self._flush_comments, self._flush_users):
                try:
                    await flush()
                except Exception as exc:
                    logging.exception("Flush step %s failed", flush.__name__)
                    if first_error is None:
                        first_error = exc
            if first_error is not None:
                raise first_error

    async def _flush_batch(self, table_name: str, pending: List[Dict[str, Any]]) -> None:
        """Upsert every row queued in ``pending`` into ``table_name``.

        Callers must hold ``self._lock``. If a write fails, the rows that
        were not written (including the failing one) are put back at the
        front of ``pending`` before the error is re-raised.
        """
        if not pending:
            return
        batch = pending.copy()
        pending.clear()
        written = 0

        def _sync_insert():
            nonlocal written
            table = self._db[table_name]
            for row in batch:
                table.upsert(row, ["id"])
                written += 1

        try:
            await asyncio.to_thread(_sync_insert)
        except Exception:
            # The worker has finished by now, so ``written`` is final.
            pending[:0] = batch[written:]
            raise
        logging.debug("Flushed %d %s to database", len(batch), table_name)

    async def _flush_rants(self):
        """Write queued rants to the ``rants`` table (lock must be held)."""
        await self._flush_batch("rants", self._rant_batch)

    async def _flush_comments(self):
        """Write queued comments to the ``comments`` table (lock must be held)."""
        await self._flush_batch("comments", self._comment_batch)

    async def _flush_users(self):
        """Write queued users to the ``users`` table (lock must be held)."""
        await self._flush_batch("users", self._user_batch)

    def _transform_rant(self, rant: Rant) -> Dict[str, Any]:
        """Flatten a rant mapping into a row for the ``rants`` table.

        ``attached_image`` may be a dict (its ``url`` is stored), a string
        (stored as is) or anything else (stored as ``None``). A non-empty
        ``tags`` value is stored as a JSON string, an empty one as ``None``.

        Raises:
            KeyError: If ``id``, ``user_id``, ``text``, ``score``,
                ``created_time`` or ``num_comments`` is missing.
        """
        attached_image = rant.get("attached_image")
        image_url = None
        if isinstance(attached_image, dict):
            image_url = attached_image.get("url")
        elif isinstance(attached_image, str):
            image_url = attached_image

        tags = rant.get("tags", [])
        tags_str = json.dumps(tags) if tags else None

        return {
            "id": rant["id"],
            "user_id": rant["user_id"],
            "text": rant["text"],
            "score": rant["score"],
            "created_time": rant["created_time"],
            "num_comments": rant["num_comments"],
            "attached_image_url": image_url,
            "tags": tags_str,
            "link": rant.get("link"),
            "vote_state": rant.get("vote_state"),
            "user_username": rant.get("user_username"),
            "user_score": rant.get("user_score"),
        }

    def _transform_comment(self, comment: Comment) -> Dict[str, Any]:
        """Flatten a comment mapping into a row for the ``comments`` table.

        Raises:
            KeyError: If ``id``, ``rant_id``, ``user_id``, ``body``,
                ``score`` or ``created_time`` is missing.
        """
        return {
            "id": comment["id"],
            "rant_id": comment["rant_id"],
            "user_id": comment["user_id"],
            "body": comment["body"],
            "score": comment["score"],
            "created_time": comment["created_time"],
            "vote_state": comment.get("vote_state"),
            "user_username": comment.get("user_username"),
            "user_score": comment.get("user_score"),
        }

    def _transform_user(self, user: UserProfile, user_id: int) -> Dict[str, Any]:
        """Flatten a user profile into a row for the ``users`` table.

        The row id comes from ``user_id``, not from the profile itself.

        Raises:
            KeyError: If ``username`` or ``score`` is missing.
        """
        return {
            "id": user_id,
            "username": user["username"],
            "score": user["score"],
            "about": user.get("about"),
            "location": user.get("location"),
            "created_time": user.get("created_time"),
            "skills": user.get("skills"),
            "github": user.get("github"),
            "website": user.get("website"),
        }

    async def add_rant(self, rant: Rant):
        """Queue a rant; flush the rant queue once it reaches ``batch_size``."""
        async with self._lock:
            self._rant_batch.append(self._transform_rant(rant))
            if len(self._rant_batch) >= self.batch_size:
                await self._flush_rants()

    async def add_comment(self, comment: Comment):
        """Queue a comment; flush the comment queue once it reaches ``batch_size``."""
        async with self._lock:
            self._comment_batch.append(self._transform_comment(comment))
            if len(self._comment_batch) >= self.batch_size:
                await self._flush_comments()

    async def add_user(self, user: UserProfile, user_id: int):
        """Queue a user; flush the user queue once it reaches ``batch_size``."""
        async with self._lock:
            self._user_batch.append(self._transform_user(user, user_id))
            if len(self._user_batch) >= self.batch_size:
                await self._flush_users()

    async def rant_exists(self, rant_id: int) -> bool:
        """Return True if a row with this id is in the ``rants`` table.

        Only the database is consulted; rows still queued in memory are not.
        """

        def _sync_check():
            table = self._db["rants"]
            return table.find_one(id=rant_id) is not None

        return await asyncio.to_thread(_sync_check)

    async def user_exists(self, user_id: int) -> bool:
        """Return True if a row with this id is in the ``users`` table.

        Only the database is consulted; rows still queued in memory are not.
        """

        def _sync_check():
            table = self._db["users"]
            return table.find_one(id=user_id) is not None

        return await asyncio.to_thread(_sync_check)

    async def _fetch_ids(self, table_name: str, sql: str, **params: Any) -> List[int]:
        """Run ``sql`` and return the ``id`` column of each result row.

        Returns an empty list when ``table_name`` has not been created yet,
        because the raw query would otherwise fail with "no such table".
        ``params`` are passed to the query as bound parameters.
        """

        def _sync_fetch():
            if table_name not in self._db:
                return []
            return [row["id"] for row in self._db.query(sql, **params)]

        return await asyncio.to_thread(_sync_fetch)

    async def get_random_user_ids(self, limit: int) -> List[int]:
        """Return up to ``limit`` user ids in random order."""
        logging.info(f"Fetching up to {limit} random user IDs from database for seeding...")
        # ``limit`` is bound as a parameter instead of being formatted into the SQL.
        user_ids = await self._fetch_ids(
            "users",
            "SELECT id FROM users ORDER BY RANDOM() LIMIT :limit",
            limit=limit,
        )
        logging.info(f"Found {len(user_ids)} user IDs to seed.")
        return user_ids

    async def get_all_rant_ids(self) -> List[int]:
        """Return the id of every row in the ``rants`` table."""
        return await self._fetch_ids("rants", "SELECT id FROM rants")

    async def get_all_user_ids(self) -> List[int]:
        """Return the id of every row in the ``users`` table."""
        return await self._fetch_ids("users", "SELECT id FROM users")

    async def save_crawler_state(self, key: str, value: str):
        """Upsert a ``key``/``value`` pair into the ``crawler_state`` table."""

        def _sync_save():
            table = self._db["crawler_state"]
            table.upsert({"key": key, "value": value}, ["key"])

        await asyncio.to_thread(_sync_save)

    async def load_crawler_state(self, key: str) -> Optional[str]:
        """Return the value stored under ``key``, or ``None`` if there is no row."""

        def _sync_load():
            table = self._db["crawler_state"]
            row = table.find_one(key=key)
            return row["value"] if row else None

        return await asyncio.to_thread(_sync_load)