# retoor <retoor@molodetz.nl>

import asyncio
import json
import logging
from typing import Any, Dict, List, Optional

import dataset

from devranta.api import Comment, Rant, UserProfile


class DatabaseManager:
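    """Buffers devRant rants, comments, and user profiles in memory and
    writes them to a SQLite database in batches via the dataset library."""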

    def __init__(self, db_path: str, batch_size: int = 100, flush_interval: float = 5.0):
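        """Rows accumulate per table until batch_size is reached or the
        periodic flush (every flush_interval seconds) writes them out."""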
        self.db_path = db_path
        self.batch_size = batch_size
        self.flush_interval = flush_interval
        self._db: Optional[dataset.Database] = None
        self._rant_batch: List[Dict[str, Any]] = []
        self._comment_batch: List[Dict[str, Any]] = []
        self._user_batch: List[Dict[str, Any]] = []
        self._flush_task: Optional[asyncio.Task] = None
        self._lock = asyncio.Lock()

    async def __aenter__(self):
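        """Open the database, ensure indexes exist, and start the flush task."""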
        logging.info(f"Connecting to database at {self.db_path}...")
        self._db = dataset.connect(
            f"sqlite:///{self.db_path}?check_same_thread=False",
            engine_kwargs={"connect_args": {"check_same_thread": False}},
        )
        await self._create_indexes()
        self._flush_task = asyncio.create_task(self._periodic_flush())
        logging.info("Database connection successful.")
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
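        """Cancel the flush task, write remaining batches, and close the database."""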
        if self._flush_task:
            self._flush_task.cancel()
            try:
                await self._flush_task
            except asyncio.CancelledError:
                pass
        await self.flush_all()
        if self._db:
            self._db.close()
        logging.info("Database connection closed.")

    async def _create_indexes(self):
        def _sync_create():
            # dataset creates tables lazily on the first upsert, so on a
            # fresh database an index target may not exist yet; skip it
            # and let a later run create the index.
            for statement in (
                "CREATE INDEX IF NOT EXISTS idx_rants_user_id ON rants(user_id)",
                "CREATE INDEX IF NOT EXISTS idx_rants_created_time ON rants(created_time)",
                "CREATE INDEX IF NOT EXISTS idx_comments_rant_id ON comments(rant_id)",
                "CREATE INDEX IF NOT EXISTS idx_comments_user_id ON comments(user_id)",
                "CREATE INDEX IF NOT EXISTS idx_users_username ON users(username)",
            ):
                try:
                    self._db.query(statement)
                except Exception:
                    logging.debug(f"Skipped index statement: {statement}")

        await asyncio.to_thread(_sync_create)
        logging.info("Database indexes verified.")

    async def _periodic_flush(self):
        while True:
            await asyncio.sleep(self.flush_interval)
            await self.flush_all()

    async def flush_all(self):
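        """Flush all pending rant, comment, and user batches to the database."""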
        async with self._lock:
            await self._flush_rants()
            await self._flush_comments()
            await self._flush_users()

    async def _flush_rants(self):
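        # Caller must hold self._lock; the batch is swapped out before the
        # blocking upserts run in a worker thread.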
        if not self._rant_batch:
            return
        batch = self._rant_batch.copy()
        self._rant_batch.clear()

        def _sync_insert():
            table = self._db["rants"]
            for rant in batch:
                table.upsert(rant, ["id"])

        await asyncio.to_thread(_sync_insert)
        logging.debug(f"Flushed {len(batch)} rants to database")

    async def _flush_comments(self):
        if not self._comment_batch:
            return
        batch = self._comment_batch.copy()
        self._comment_batch.clear()

        def _sync_insert():
            table = self._db["comments"]
            for comment in batch:
                table.upsert(comment, ["id"])

        await asyncio.to_thread(_sync_insert)
        logging.debug(f"Flushed {len(batch)} comments to database")

    async def _flush_users(self):
        if not self._user_batch:
            return
        batch = self._user_batch.copy()
        self._user_batch.clear()

        def _sync_insert():
            table = self._db["users"]
            for user in batch:
                table.upsert(user, ["id"])

        await asyncio.to_thread(_sync_insert)
        logging.debug(f"Flushed {len(batch)} users to database")

    def _transform_rant(self, rant: Rant) -> Dict[str, Any]:
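        """Flatten a Rant into a plain row: the attached image collapses to
        its URL and the tag list is serialized to JSON for SQLite storage."""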
        attached_image = rant.get("attached_image")
        image_url = None
        if isinstance(attached_image, dict):
            image_url = attached_image.get("url")
        elif isinstance(attached_image, str):
            image_url = attached_image

        tags = rant.get("tags", [])
        tags_str = json.dumps(tags) if tags else None

        return {
            "id": rant["id"],
            "user_id": rant["user_id"],
            "text": rant["text"],
            "score": rant["score"],
            "created_time": rant["created_time"],
            "num_comments": rant["num_comments"],
            "attached_image_url": image_url,
            "tags": tags_str,
            "link": rant.get("link"),
            "vote_state": rant.get("vote_state"),
            "user_username": rant.get("user_username"),
            "user_score": rant.get("user_score"),
        }

    def _transform_comment(self, comment: Comment) -> Dict[str, Any]:
        return {
            "id": comment["id"],
            "rant_id": comment["rant_id"],
            "user_id": comment["user_id"],
            "body": comment["body"],
            "score": comment["score"],
            "created_time": comment["created_time"],
            "vote_state": comment.get("vote_state"),
            "user_username": comment.get("user_username"),
            "user_score": comment.get("user_score"),
        }

    def _transform_user(self, user: UserProfile, user_id: int) -> Dict[str, Any]:
        return {
            "id": user_id,
            "username": user["username"],
            "score": user["score"],
            "about": user.get("about"),
            "location": user.get("location"),
            "created_time": user.get("created_time"),
            "skills": user.get("skills"),
            "github": user.get("github"),
            "website": user.get("website"),
        }

    async def add_rant(self, rant: Rant):
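        """Queue a rant for upsert, flushing immediately once the batch is full."""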
        async with self._lock:
            self._rant_batch.append(self._transform_rant(rant))
            if len(self._rant_batch) >= self.batch_size:
                await self._flush_rants()

    async def add_comment(self, comment: Comment):
        async with self._lock:
            self._comment_batch.append(self._transform_comment(comment))
            if len(self._comment_batch) >= self.batch_size:
                await self._flush_comments()

    async def add_user(self, user: UserProfile, user_id: int):
        async with self._lock:
            self._user_batch.append(self._transform_user(user, user_id))
            if len(self._user_batch) >= self.batch_size:
                await self._flush_users()

    async def rant_exists(self, rant_id: int) -> bool:
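        # Checks flushed rows only; a rant still sitting in the in-memory
        # batch is not visible here until the next flush.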
        def _sync_check():
            table = self._db["rants"]
            return table.find_one(id=rant_id) is not None

        return await asyncio.to_thread(_sync_check)

    async def user_exists(self, user_id: int) -> bool:
        def _sync_check():
            table = self._db["users"]
            return table.find_one(id=user_id) is not None

        return await asyncio.to_thread(_sync_check)

    async def get_random_user_ids(self, limit: int) -> List[int]:
        logging.info(f"Fetching up to {limit} random user IDs from database for seeding...")

        def _sync_fetch():
            # Coerce to int so the inlined LIMIT cannot inject SQL.
            result = self._db.query(f"SELECT id FROM users ORDER BY RANDOM() LIMIT {int(limit)}")
            return [row["id"] for row in result]

        user_ids = await asyncio.to_thread(_sync_fetch)
        logging.info(f"Found {len(user_ids)} user IDs to seed.")
        return user_ids

    async def get_all_rant_ids(self) -> List[int]:
        def _sync_fetch():
            result = self._db.query("SELECT id FROM rants")
            return [row["id"] for row in result]

        return await asyncio.to_thread(_sync_fetch)

    async def get_all_user_ids(self) -> List[int]:
        def _sync_fetch():
            result = self._db.query("SELECT id FROM users")
            return [row["id"] for row in result]

        return await asyncio.to_thread(_sync_fetch)

    async def save_crawler_state(self, key: str, value: str):
        def _sync_save():
            table = self._db["crawler_state"]
            table.upsert({"key": key, "value": value}, ["key"])

        await asyncio.to_thread(_sync_save)

    async def load_crawler_state(self, key: str) -> Optional[str]:
        def _sync_load():
            table = self._db["crawler_state"]
            row = table.find_one(key=key)
            return row["value"] if row else None

        return await asyncio.to_thread(_sync_load)
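

# Minimal usage sketch (illustrative, not part of the original module; the
# "devrant.db" path and the sample rant dict are assumptions). It shows the
# intended lifecycle: the async context manager opens the database and
# starts the periodic flush task, and rows still batched in memory are
# written out on exit.
if __name__ == "__main__":
    async def _demo():
        async with DatabaseManager("devrant.db", batch_size=2) as db:
            sample_rant = {
                "id": 1, "user_id": 42, "text": "demo rant", "score": 0,
                "created_time": 0, "num_comments": 0,
            }
            await db.add_rant(sample_rant)
            # Still buffered: batch_size has not been reached yet, so the
            # row is not visible in the database until a flush happens.
            print(await db.rant_exists(1))

    asyncio.run(_demo())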