# retoor <retoor@molodetz.nl>
import asyncio
import json
import logging
from typing import Any, Dict, List, Optional

import dataset
from devranta.api import Comment, Rant, UserProfile

class DatabaseManager:
    """Async batching layer over a SQLite database accessed via ``dataset``.

    Rows for rants, comments and users are buffered in memory and upserted
    in batches — either when a buffer reaches ``batch_size`` or on a periodic
    timer firing every ``flush_interval`` seconds.  All blocking database
    calls are moved off the event loop with ``asyncio.to_thread``.  Intended
    to be used as an async context manager::

        async with DatabaseManager("crawl.db") as db:
            await db.add_rant(rant)
    """

    def __init__(self, db_path: str, batch_size: int = 100, flush_interval: float = 5.0):
        """
        Args:
            db_path: Filesystem path of the SQLite database file.
            batch_size: Buffered-row count that triggers an immediate flush.
            flush_interval: Seconds between periodic background flushes.
        """
        self.db_path = db_path
        self.batch_size = batch_size
        self.flush_interval = flush_interval
        self._db: Optional[dataset.Database] = None
        self._rant_batch: List[Dict[str, Any]] = []
        self._comment_batch: List[Dict[str, Any]] = []
        self._user_batch: List[Dict[str, Any]] = []
        self._flush_task: Optional[asyncio.Task] = None
        # Serializes access to the three batch lists across coroutines.
        self._lock = asyncio.Lock()

    async def __aenter__(self):
        """Connect, ensure indexes exist, and start the periodic flush task."""
        logging.info(f"Connecting to database at {self.db_path}...")
        self._db = dataset.connect(
            f"sqlite:///{self.db_path}?check_same_thread=False",
            engine_kwargs={"connect_args": {"check_same_thread": False}},
        )
        await self._create_indexes()
        self._flush_task = asyncio.create_task(self._periodic_flush())
        logging.info("Database connection successful.")
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Cancel the flush task, flush remaining rows, and close the database."""
        if self._flush_task:
            self._flush_task.cancel()
            try:
                await self._flush_task
            except asyncio.CancelledError:
                pass
        try:
            await self.flush_all()
        finally:
            # Close even when the final flush fails so the handle is released.
            if self._db:
                self._db.close()
            logging.info("Database connection closed.")

    async def _create_indexes(self):
        """Create lookup indexes (idempotent) without blocking the event loop."""

        def _sync_create():
            self._db.query("CREATE INDEX IF NOT EXISTS idx_rants_user_id ON rants(user_id)")
            self._db.query("CREATE INDEX IF NOT EXISTS idx_rants_created_time ON rants(created_time)")
            self._db.query("CREATE INDEX IF NOT EXISTS idx_comments_rant_id ON comments(rant_id)")
            self._db.query("CREATE INDEX IF NOT EXISTS idx_comments_user_id ON comments(user_id)")
            self._db.query("CREATE INDEX IF NOT EXISTS idx_users_username ON users(username)")

        await asyncio.to_thread(_sync_create)
        logging.info("Database indexes verified.")

    async def _periodic_flush(self):
        """Background task: flush all batches every ``flush_interval`` seconds.

        A failed flush is logged and retried on the next tick instead of
        silently killing the task, which would stop all periodic flushing.
        """
        while True:
            await asyncio.sleep(self.flush_interval)
            try:
                await self.flush_all()
            except asyncio.CancelledError:
                raise
            except Exception:
                logging.exception("Periodic flush failed; will retry")

    async def flush_all(self):
        """Flush every pending batch to the database."""
        async with self._lock:
            await self._flush_rants()
            await self._flush_comments()
            await self._flush_users()

    async def _flush_batch(self, batch: List[Dict[str, Any]], table_name: str, label: str):
        """Upsert the rows of *batch* into *table_name* and clear the batch.

        Caller must hold ``self._lock``.  The batch is copied and cleared
        before the thread hop so new rows can accumulate in the meantime.
        """
        if not batch:
            return
        rows = batch.copy()
        batch.clear()

        def _sync_insert():
            table = self._db[table_name]
            for row in rows:
                table.upsert(row, ["id"])

        await asyncio.to_thread(_sync_insert)
        logging.debug(f"Flushed {len(rows)} {label} to database")

    async def _flush_rants(self):
        """Flush the pending rant batch (caller holds the lock)."""
        await self._flush_batch(self._rant_batch, "rants", "rants")

    async def _flush_comments(self):
        """Flush the pending comment batch (caller holds the lock)."""
        await self._flush_batch(self._comment_batch, "comments", "comments")

    async def _flush_users(self):
        """Flush the pending user batch (caller holds the lock)."""
        await self._flush_batch(self._user_batch, "users", "users")

    def _transform_rant(self, rant: Rant) -> Dict[str, Any]:
        """Flatten an API rant into a row dict for the ``rants`` table."""
        # ``attached_image`` may be a dict carrying a "url" key, a bare URL
        # string, or absent entirely.
        attached_image = rant.get("attached_image")
        image_url = None
        if isinstance(attached_image, dict):
            image_url = attached_image.get("url")
        elif isinstance(attached_image, str):
            image_url = attached_image
        tags = rant.get("tags", [])
        # Tags are stored as a JSON-encoded string; NULL when the list is empty.
        tags_str = json.dumps(tags) if tags else None
        return {
            "id": rant["id"],
            "user_id": rant["user_id"],
            "text": rant["text"],
            "score": rant["score"],
            "created_time": rant["created_time"],
            "num_comments": rant["num_comments"],
            "attached_image_url": image_url,
            "tags": tags_str,
            "link": rant.get("link"),
            "vote_state": rant.get("vote_state"),
            "user_username": rant.get("user_username"),
            "user_score": rant.get("user_score"),
        }

    def _transform_comment(self, comment: Comment) -> Dict[str, Any]:
        """Flatten an API comment into a row dict for the ``comments`` table."""
        return {
            "id": comment["id"],
            "rant_id": comment["rant_id"],
            "user_id": comment["user_id"],
            "body": comment["body"],
            "score": comment["score"],
            "created_time": comment["created_time"],
            "vote_state": comment.get("vote_state"),
            "user_username": comment.get("user_username"),
            "user_score": comment.get("user_score"),
        }

    def _transform_user(self, user: UserProfile, user_id: int) -> Dict[str, Any]:
        """Flatten an API user profile into a row dict for the ``users`` table.

        The profile payload does not carry its own id, so *user_id* is
        supplied by the caller.
        """
        return {
            "id": user_id,
            "username": user["username"],
            "score": user["score"],
            "about": user.get("about"),
            "location": user.get("location"),
            "created_time": user.get("created_time"),
            "skills": user.get("skills"),
            "github": user.get("github"),
            "website": user.get("website"),
        }

    async def add_rant(self, rant: Rant):
        """Queue a rant for storage; flushes when the batch is full."""
        async with self._lock:
            self._rant_batch.append(self._transform_rant(rant))
            if len(self._rant_batch) >= self.batch_size:
                await self._flush_rants()

    async def add_comment(self, comment: Comment):
        """Queue a comment for storage; flushes when the batch is full."""
        async with self._lock:
            self._comment_batch.append(self._transform_comment(comment))
            if len(self._comment_batch) >= self.batch_size:
                await self._flush_comments()

    async def add_user(self, user: UserProfile, user_id: int):
        """Queue a user profile for storage; flushes when the batch is full."""
        async with self._lock:
            self._user_batch.append(self._transform_user(user, user_id))
            if len(self._user_batch) >= self.batch_size:
                await self._flush_users()

    async def rant_exists(self, rant_id: int) -> bool:
        """Return True if a rant row with *rant_id* is already persisted.

        NOTE(review): only checks the database, not the in-memory batch.
        """

        def _sync_check():
            table = self._db["rants"]
            return table.find_one(id=rant_id) is not None

        return await asyncio.to_thread(_sync_check)

    async def user_exists(self, user_id: int) -> bool:
        """Return True if a user row with *user_id* is already persisted.

        NOTE(review): only checks the database, not the in-memory batch.
        """

        def _sync_check():
            table = self._db["users"]
            return table.find_one(id=user_id) is not None

        return await asyncio.to_thread(_sync_check)

    async def get_random_user_ids(self, limit: int) -> List[int]:
        """Return up to *limit* user ids sampled uniformly at random."""
        logging.info(f"Fetching up to {limit} random user IDs from database for seeding...")

        def _sync_fetch():
            # Coerce to int so a non-integer value can never reach the SQL
            # string (LIMIT cannot be bound on all dataset/SQLAlchemy combos).
            result = self._db.query(f"SELECT id FROM users ORDER BY RANDOM() LIMIT {int(limit)}")
            return [row["id"] for row in result]

        user_ids = await asyncio.to_thread(_sync_fetch)
        logging.info(f"Found {len(user_ids)} user IDs to seed.")
        return user_ids

    async def get_all_rant_ids(self) -> List[int]:
        """Return the ids of every persisted rant."""

        def _sync_fetch():
            result = self._db.query("SELECT id FROM rants")
            return [row["id"] for row in result]

        return await asyncio.to_thread(_sync_fetch)

    async def get_all_user_ids(self) -> List[int]:
        """Return the ids of every persisted user."""

        def _sync_fetch():
            result = self._db.query("SELECT id FROM users")
            return [row["id"] for row in result]

        return await asyncio.to_thread(_sync_fetch)

    async def save_crawler_state(self, key: str, value: str):
        """Persist a single key/value pair in the ``crawler_state`` table."""

        def _sync_save():
            table = self._db["crawler_state"]
            table.upsert({"key": key, "value": value}, ["key"])

        await asyncio.to_thread(_sync_save)

    async def load_crawler_state(self, key: str) -> Optional[str]:
        """Return the stored value for *key*, or None if never saved."""

        def _sync_load():
            table = self._db["crawler_state"]
            row = table.find_one(key=key)
            return row["value"] if row else None

        return await asyncio.to_thread(_sync_load)