Update.

parent d0c8156519
commit 3817151750

45  .gitignore  (vendored)
@@ -1,11 +1,40 @@
-.venv
-.history
-__pycache__
-*.pyc
-.env
-*.db
-examples/crawler/devrant.sqlite-shm
-examples/crawler/devrant.sqlite-wal
-examples/crawler/devrant.sqlite
-examples/crawler/.venv
-examples/crawler/__pycache__
+# retoor <retoor@molodetz.nl>
+
+# Python
+__pycache__/
+*.pyc
+*.pyo
+*.egg-info/
+dist/
+build/
+.eggs/
+
+# Virtual environments
+.venv/
+venv/
+
+# Environment
+.env
+
+# IDE
+.history/
+.idea/
+.vscode/
+*.swp
+
+# C build artifacts
+*.o
+*.so
+*.a
+*.c
+*.h
+
+# Databases
+*.db
+*.sqlite
+*.sqlite-shm
+*.sqlite-wal
+
+# OS
+.DS_Store
+Thumbs.db
37  README.md

@@ -1,28 +1,45 @@
 # devRanta
-devRanta is the best async devRant client written in Python. Authentication is only needed for half of the functionality; thus, the username and password are optional parameters when constructing the main class of this package (Api). You can find the latest packages in tar and wheel format [here](https://retoor.molodetz.nl/retoor/devranta/packages).
+
+Author: retoor <retoor@molodetz.nl>
+
+An asynchronous Python client for the devRant API. Authentication is only required for write operations; read-only endpoints work without credentials. Packages available in tar and wheel format [here](https://retoor.molodetz.nl/retoor/devranta/packages).
 
 ## Running
 ```
 make run
 ```
 ## Testing
-Tests are only made for methods not requireing authentication.
-I do not see value in mocking requests.
+Tests cover methods not requiring authentication.
 
 ```
 make test
 ```
-## How to use
-Implementation:
-```
+## Usage
+
+```python
 from devranta.api import Api
-api = Api(username="optional!", password="optional!")
+
+api = Api(username="optional", password="optional")
+
 
 async def list_rants():
     async for rant in api.get_rants():
         print(rant["user_username"], ":", rant["text"])
 ```
-See [tests](src/devranta/tests.py) for [examples](src/devranta/tests.py) on how to use.
+
+See [tests](src/devranta/tests.py) for additional examples.
+
+## Examples
+
+| Example | Description |
+|---------|-------------|
+| [crawler](examples/crawler/) | Asynchronous data collection with producer-consumer architecture |
+| [princess](examples/princess/) | Automated response bot with LLM integration |
+
 # devRant API Documentation
-For people wanting to build their own client.
-TODO: document responses.
+Reference for building custom clients.
 ## Base URL
 `https://devrant.com/api`
 ## Authentication
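The README's new usage example covers only unauthenticated reads. For the authenticated half, the client exposes `ensure_login()` (visible in the api.py hunks further down). A minimal sketch, with placeholder credentials:

```python
import asyncio

from devranta.api import Api


async def main():
    # Credentials are optional at construction time; they are only used
    # once an authenticated endpoint is hit.
    api = Api(username="your_username", password="your_password")  # placeholders
    if await api.ensure_login():
        print("Authenticated, token id:", api.token_id)


asyncio.run(main())
```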
examples/crawler/Makefile

@@ -1,10 +1,10 @@
+# retoor <retoor@molodetz.nl>
 .PHONY: all env install run clean
 
 all: env install run
 
 env:
 	python3 -m venv .venv
-.
 
 install:
 	. .venv/bin/activate && pip install -r requirements.txt
examples/crawler/README.md

@@ -1,34 +1,122 @@
-# Example Crawler Project
+# devRant Exhaustive Crawler
 
-This is a simple example crawler project. Follow the instructions below to set up and run the crawler.
+Author: retoor <retoor@molodetz.nl>
 
-## Setup
+An asynchronous crawler for comprehensive data collection from the devRant platform. Implements a producer-consumer architecture with multiple discovery strategies to maximize content coverage.
 
-1. Clone the repository or copy the project files to your local machine.
-2. Make sure you have Python 3 installed.
+## SSL Note
+
+The devRant API SSL certificate is expired. This crawler disables SSL verification to maintain connectivity. This is handled automatically by the API client.
+
+## Architecture
+
+The crawler employs four concurrent producers feeding into worker pools:
+
+| Producer | Strategy | Interval |
+|----------|----------|----------|
+| Recent | Paginate through recent rants | 2s |
+| Top | Paginate through top-rated rants | 5s |
+| Algo | Paginate through algorithm-sorted rants | 5s |
+| Search | Cycle through 48 programming-related search terms | 30s |
+
+Worker pools process discovered content:
+
+- 10 rant consumers fetch rant details and extract comments
+- 5 user consumers fetch profiles and discover associated rants
+
+Discovery graph: rants reveal users, users reveal more rants (from their profile, upvoted, favorites).
+
+## Data Storage
+
+Uses SQLite via the dataset library with:
+
+- Batched writes (100 items or 5s interval)
+- Automatic upsert for deduplication
+- Indexes on user_id, created_time, rant_id
+- State persistence for resume capability
+
+### Schema
+
+**rants**: id, user_id, text, score, created_time, num_comments, attached_image_url, tags, link, vote_state, user_username, user_score
+
+**comments**: id, rant_id, user_id, body, score, created_time, vote_state, user_username, user_score
+
+**users**: id, username, score, about, location, created_time, skills, github, website
+
+**crawler_state**: Persists producer positions (skip values, search term index)
 
 ## Usage
 
-1. Open a terminal in the project directory.
-2. Run `make` to set up the environment, install dependencies, and start the crawler:
+### Quick Start
 
 ```bash
 make
 ```
 
-This will create a virtual environment, install the package in editable mode from the parent directory, and run the main script.
+This creates a virtual environment, installs dependencies, and starts the crawler.
+
+### Manual Setup
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -e ../../.
+pip install -r requirements.txt
+python main.py
+```
+
+### Stopping
+
+Press `Ctrl+C` for graceful shutdown. The crawler will:
+
+1. Save current state to database
+2. Wait up to 30 seconds for queues to drain
+3. Flush remaining batched writes
+
+### Resuming
+
+Simply run again. The crawler loads saved state and continues from where it stopped.
+
+## Configuration
+
+Edit `main.py` to adjust:
+
+```python
+DB_FILE = "devrant.sqlite"
+CONCURRENT_RANT_CONSUMERS = 10
+CONCURRENT_USER_CONSUMERS = 5
+BATCH_SIZE = 100
+FLUSH_INTERVAL = 5.0
+```
+
+## Output
+
+The crawler logs statistics every 15 seconds:
+
+```
+[STATS] Rants Q'd/Proc: 1250/1200 | Users Q'd/Proc: 450/400 | Comments DB: 5600 | Queues (R/U): 50/50 | API Errors: 0
+```
 
 ## Cleanup
 
-To remove the virtual environment, run:
-
 ```bash
 make clean
 ```
 
-## Notes
-
-- The project installs the package with `-e ../../.` to include the parent package `devranta` in editable mode.
-- Ensure that the parent package is correctly set up in the directory structure.
-
-Happy crawling!
+Removes the virtual environment. Database file (`devrant.sqlite`) is preserved.
+
+## Requirements
+
+- Python 3.10+
+- dataset
+- aiohttp (via parent devranta package)
+
+## File Structure
+
+```
+crawler/
+├── main.py           # Entry point, configuration
+├── crawler.py        # Producer-consumer implementation
+├── database.py       # Dataset wrapper with batch queue
+├── requirements.txt  # Dependencies
+├── Makefile          # Build automation
+├── .venv/            # Virtual environment (created on first run)
+└── devrant.sqlite    # SQLite database (created on first run)
+```
examples/crawler/crawler.py

@@ -1,12 +1,50 @@
+# retoor <retoor@molodetz.nl>
 import asyncio
+import json
 import logging
+import time
+from collections import OrderedDict
 from typing import Set
 
 from database import DatabaseManager
 
 from devranta.api import Api, Rant
 
 
+SEARCH_TERMS = [
+    "python", "javascript", "java", "csharp", "golang", "rust",
+    "react", "angular", "vue", "node", "docker", "kubernetes",
+    "linux", "windows", "macos", "git", "github", "gitlab",
+    "sql", "mongodb", "redis", "api", "rest", "graphql",
+    "bug", "error", "crash", "debug", "fix", "issue",
+    "manager", "deadline", "meeting", "standup", "agile", "scrum",
+    "frontend", "backend", "fullstack", "devops", "cloud", "aws",
+    "typescript", "php", "ruby", "swift", "kotlin", "flutter",
+]
+
+
+class BoundedSeenSet:
+    def __init__(self, maxsize: int = 100000):
+        self._set = OrderedDict()
+        self._maxsize = maxsize
+
+    def add(self, item: int):
+        if item in self._set:
+            self._set.move_to_end(item)
+        else:
+            self._set[item] = None
+            if len(self._set) > self._maxsize:
+                self._set.popitem(last=False)
+
+    def __contains__(self, item: int) -> bool:
+        return item in self._set
+
+    def __len__(self) -> int:
+        return len(self._set)
+
+    def clear(self):
+        self._set.clear()
+
+
 class DevRantCrawler:
     def __init__(
         self, api: Api, db: DatabaseManager, rant_consumers: int, user_consumers: int
@@ -20,8 +58,14 @@ class DevRantCrawler:
         self.num_rant_consumers = rant_consumers
         self.num_user_consumers = user_consumers
 
-        self.seen_rant_ids: Set[int] = set()
-        self.seen_user_ids: Set[int] = set()
+        self.seen_rant_ids = BoundedSeenSet(maxsize=100000)
+        self.seen_user_ids = BoundedSeenSet(maxsize=100000)
+
+        self._recent_skip = 0
+        self._top_skip = 0
+        self._algo_skip = 0
+        self._search_term_index = 0
 
         self.stats = {
             "rants_processed": 0,
             "rants_added_to_db": 0,
@@ -74,29 +118,58 @@ class DevRantCrawler:
             f"Seeder finished: Queued {len(user_ids)} users to kickstart exploration."
         )
 
+    async def _save_state(self):
+        state = {
+            "recent_skip": self._recent_skip,
+            "top_skip": self._top_skip,
+            "algo_skip": self._algo_skip,
+            "search_term_index": self._search_term_index,
+            "last_saved": int(time.time()),
+        }
+        await self.db.save_crawler_state("producer_state", json.dumps(state))
+        logging.debug("Crawler state saved.")
+
+    async def _load_state(self):
+        state_json = await self.db.load_crawler_state("producer_state")
+        if state_json:
+            try:
+                state = json.loads(state_json)
+                self._recent_skip = state.get("recent_skip", 0)
+                self._top_skip = state.get("top_skip", 0)
+                self._algo_skip = state.get("algo_skip", 0)
+                self._search_term_index = state.get("search_term_index", 0)
+                logging.info(f"Loaded crawler state: {state}")
+            except json.JSONDecodeError:
+                logging.warning("Failed to decode crawler state, starting fresh.")
+
+    async def _state_saver(self):
+        logging.info("State saver started.")
+        while not self.shutdown_event.is_set():
+            await asyncio.sleep(60)
+            await self._save_state()
+
     async def _rant_producer(self):
-        logging.info("Rant producer started.")
-        skip = 0
+        logging.info("Recent rant producer started.")
         consecutive_empty_responses = 0
 
         while not self.shutdown_event.is_set():
            try:
-                logging.info(f"Producer: Fetching rants with skip={skip}...")
-                rants = await self.api.get_rants(sort="recent", limit=50, skip=skip)
+                logging.debug(f"Recent producer: Fetching rants with skip={self._recent_skip}...")
+                rants = await self.api.get_rants(sort="recent", limit=50, skip=self._recent_skip)
                 self.stats["producer_loops"] += 1
 
                 if not rants:
                     consecutive_empty_responses += 1
-                    logging.info(
-                        f"Producer: Feed returned empty. Consecutive empty hits: {consecutive_empty_responses}."
+                    logging.debug(
+                        f"Recent producer: Feed returned empty. Consecutive empty hits: {consecutive_empty_responses}."
                     )
                     if consecutive_empty_responses >= 5:
                         self.stats["end_of_feed_hits"] += 1
                         logging.info(
-                            "Producer: End of feed likely reached. Pausing for 15 minutes before reset."
+                            "Recent producer: End of feed likely reached. Pausing for 15 minutes before reset."
                         )
                         await asyncio.sleep(900)
-                        skip = 0
+                        self._recent_skip = 0
                         consecutive_empty_responses = 0
                     else:
                         await asyncio.sleep(10)
@@ -108,25 +181,98 @@ class DevRantCrawler:
                     await self._queue_rant_if_new(rant)
                     new_rants_found += 1
 
-                logging.info(
-                    f"Producer: Processed {new_rants_found} rants from feed. Total queued: {self.stats['rants_queued']}."
+                logging.debug(
+                    f"Recent producer: Processed {new_rants_found} rants from feed."
                 )
-                skip += len(rants)
+                self._recent_skip += len(rants)
                 await asyncio.sleep(2)
 
             except Exception as e:
-                logging.critical(
-                    f"Producer: Unhandled exception: {e}. Retrying in 60s."
+                logging.error(
+                    f"Recent producer: Unhandled exception: {e}. Retrying in 60s."
                 )
                 self.stats["api_errors"] += 1
                 await asyncio.sleep(60)
 
+    async def _top_rant_producer(self):
+        logging.info("Top rant producer started.")
+
+        while not self.shutdown_event.is_set():
+            try:
+                logging.debug(f"Top producer: Fetching rants with skip={self._top_skip}...")
+                rants = await self.api.get_rants(sort="top", limit=50, skip=self._top_skip)
+
+                if not rants:
+                    logging.info("Top producer: End of feed reached. Resetting after 1 hour.")
+                    self._top_skip = 0
+                    await asyncio.sleep(3600)
+                    continue
+
+                for rant in rants:
+                    await self._queue_rant_if_new(rant)
+
+                logging.debug(f"Top producer: Processed {len(rants)} rants.")
+                self._top_skip += len(rants)
+                await asyncio.sleep(5)
+
+            except Exception as e:
+                logging.error(f"Top producer: Unhandled exception: {e}. Retrying in 60s.")
+                self.stats["api_errors"] += 1
+                await asyncio.sleep(60)
+
+    async def _algo_rant_producer(self):
+        logging.info("Algo rant producer started.")
+
+        while not self.shutdown_event.is_set():
+            try:
+                logging.debug(f"Algo producer: Fetching rants with skip={self._algo_skip}...")
+                rants = await self.api.get_rants(sort="algo", limit=50, skip=self._algo_skip)
+
+                if not rants:
+                    logging.info("Algo producer: End of feed reached. Resetting after 1 hour.")
+                    self._algo_skip = 0
+                    await asyncio.sleep(3600)
+                    continue
+
+                for rant in rants:
+                    await self._queue_rant_if_new(rant)
+
+                logging.debug(f"Algo producer: Processed {len(rants)} rants.")
+                self._algo_skip += len(rants)
+                await asyncio.sleep(5)
+
+            except Exception as e:
+                logging.error(f"Algo producer: Unhandled exception: {e}. Retrying in 60s.")
+                self.stats["api_errors"] += 1
+                await asyncio.sleep(60)
+
+    async def _search_producer(self):
+        logging.info("Search producer started.")
+
+        while not self.shutdown_event.is_set():
+            try:
+                term = SEARCH_TERMS[self._search_term_index % len(SEARCH_TERMS)]
+                logging.debug(f"Search producer: Searching for '{term}'...")
+                rants = await self.api.search(term)
+
+                for rant in rants:
+                    await self._queue_rant_if_new(rant)
+
+                logging.debug(f"Search producer: Found {len(rants)} rants for '{term}'.")
+                self._search_term_index += 1
+                await asyncio.sleep(30)
+
+            except Exception as e:
+                logging.error(f"Search producer: Unhandled exception: {e}. Retrying in 60s.")
+                self.stats["api_errors"] += 1
+                await asyncio.sleep(60)
+
     async def _rant_consumer(self, worker_id: int):
         logging.info(f"Rant consumer #{worker_id} started.")
         while not self.shutdown_event.is_set():
             try:
-                rant_id = await self.rant_queue.get()
-                logging.info(
+                rant_id = await asyncio.wait_for(self.rant_queue.get(), timeout=5.0)
+                logging.debug(
                     f"Rant consumer #{worker_id}: Processing rant ID {rant_id}."
                 )
@@ -146,22 +292,27 @@ class DevRantCrawler:
                     self.stats["comments_added_to_db"] += 1
                     await self._queue_user_if_new(comment["user_id"])
 
-                logging.info(
+                logging.debug(
                     f"Rant consumer #{worker_id}: Finished processing rant {rant_id}, found {len(comments)} comments."
                 )
                 self.stats["rants_processed"] += 1
                 self.rant_queue.task_done()
 
+            except asyncio.TimeoutError:
+                continue
             except Exception as e:
                 logging.error(f"Rant consumer #{worker_id}: Unhandled exception: {e}")
-                self.rant_queue.task_done()
+                try:
+                    self.rant_queue.task_done()
+                except ValueError:
+                    pass
 
     async def _user_consumer(self, worker_id: int):
         logging.info(f"User consumer #{worker_id} started.")
         while not self.shutdown_event.is_set():
             try:
-                user_id = await self.user_queue.get()
-                logging.info(
+                user_id = await asyncio.wait_for(self.user_queue.get(), timeout=5.0)
+                logging.debug(
                     f"User consumer #{worker_id}: Processing user ID {user_id}."
                 )
@@ -183,14 +334,20 @@ class DevRantCrawler:
                         await self._queue_rant_if_new(rant_obj)
                         rants_found_on_profile += 1
 
-                logging.info(
+                logging.debug(
                     f"User consumer #{worker_id}: Finished user {user_id}, found and queued {rants_found_on_profile} associated rants."
                 )
                 self.stats["users_processed"] += 1
                 self.user_queue.task_done()
 
+            except asyncio.TimeoutError:
+                continue
             except Exception as e:
                 logging.error(f"User consumer #{worker_id}: Unhandled exception: {e}")
-                self.user_queue.task_done()
+                try:
+                    self.user_queue.task_done()
+                except ValueError:
+                    pass
 
     async def _stats_reporter(self):
         logging.info("Stats reporter started.")
@@ -206,13 +363,18 @@ class DevRantCrawler:
 
     async def run(self):
         logging.info("Exhaustive crawler starting...")
+        await self._load_state()
         await self._initial_seed()
 
         logging.info("Starting main producer and consumer tasks...")
         tasks = []
         try:
             tasks.append(asyncio.create_task(self._rant_producer()))
+            tasks.append(asyncio.create_task(self._top_rant_producer()))
+            tasks.append(asyncio.create_task(self._algo_rant_producer()))
+            tasks.append(asyncio.create_task(self._search_producer()))
             tasks.append(asyncio.create_task(self._stats_reporter()))
+            tasks.append(asyncio.create_task(self._state_saver()))
 
             for i in range(self.num_rant_consumers):
                 tasks.append(asyncio.create_task(self._rant_consumer(i + 1)))
@@ -232,6 +394,8 @@ class DevRantCrawler:
         logging.info("Shutting down... sending signal to all tasks.")
         self.shutdown_event.set()
 
+        await self._save_state()
+
         logging.info("Waiting for queues to empty... Press Ctrl+C again to force exit.")
         try:
             await asyncio.wait_for(self.rant_queue.join(), timeout=30)
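The `BoundedSeenSet` introduced above is the memory cap in this commit: the previous plain `set()` grew without bound on long crawls, while the `OrderedDict`-backed version keeps at most `maxsize` IDs, refreshing re-seen items (`move_to_end`) and evicting the least recently seen (`popitem(last=False)`). A small demonstration of those semantics (the import assumes it runs inside `examples/crawler` with dependencies installed):

```python
# Eviction semantics of BoundedSeenSet (class defined in crawler.py above).
from crawler import BoundedSeenSet

seen = BoundedSeenSet(maxsize=3)
for rant_id in (1, 2, 3):
    seen.add(rant_id)

seen.add(1)  # already present: refreshed, not duplicated
seen.add(4)  # over capacity: evicts 2, the least recently seen entry

assert 2 not in seen
assert 1 in seen and 4 in seen
assert len(seen) == 3
```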
examples/crawler/database.py

@@ -1,130 +1,226 @@
+# retoor <retoor@molodetz.nl>
+import asyncio
+import json
 import logging
-from typing import List
+from typing import Any, Dict, List, Optional
 
-import aiosqlite
+import dataset
 
 from devranta.api import Comment, Rant, UserProfile
 
 
 class DatabaseManager:
-    def __init__(self, db_path: str):
+    def __init__(self, db_path: str, batch_size: int = 100, flush_interval: float = 5.0):
         self.db_path = db_path
-        self._conn: aiosqlite.Connection | None = None
+        self.batch_size = batch_size
+        self.flush_interval = flush_interval
+        self._db: Optional[dataset.Database] = None
+        self._rant_batch: List[Dict[str, Any]] = []
+        self._comment_batch: List[Dict[str, Any]] = []
+        self._user_batch: List[Dict[str, Any]] = []
+        self._flush_task: Optional[asyncio.Task] = None
+        self._lock = asyncio.Lock()
 
     async def __aenter__(self):
         logging.info(f"Connecting to database at {self.db_path}...")
-        self._conn = await aiosqlite.connect(self.db_path)
-        await self._conn.execute("PRAGMA journal_mode=WAL;")
-        await self._conn.execute("PRAGMA foreign_keys=ON;")
-        await self.create_tables()
+        self._db = dataset.connect(
+            f"sqlite:///{self.db_path}?check_same_thread=False",
+            engine_kwargs={"connect_args": {"check_same_thread": False}}
+        )
+        await self._create_indexes()
+        self._flush_task = asyncio.create_task(self._periodic_flush())
         logging.info("Database connection successful.")
         return self
 
     async def __aexit__(self, exc_type, exc_val, exc_tb):
-        if self._conn:
-            await self._conn.close()
+        if self._flush_task:
+            self._flush_task.cancel()
+            try:
+                await self._flush_task
+            except asyncio.CancelledError:
+                pass
+        await self.flush_all()
+        if self._db:
+            self._db.close()
         logging.info("Database connection closed.")
 
-    async def create_tables(self):
-        logging.info("Ensuring database tables exist...")
-        await self._conn.executescript(
-            """
-            CREATE TABLE IF NOT EXISTS users (
-                id INTEGER PRIMARY KEY,
-                username TEXT NOT NULL UNIQUE,
-                score INTEGER,
-                about TEXT,
-                location TEXT,
-                created_time INTEGER,
-                skills TEXT,
-                github TEXT,
-                website TEXT
-            );
-            CREATE TABLE IF NOT EXISTS rants (
-                id INTEGER PRIMARY KEY,
-                user_id INTEGER,
-                text TEXT,
-                score INTEGER,
-                created_time INTEGER,
-                num_comments INTEGER
-            );
-            CREATE TABLE IF NOT EXISTS comments (
-                id INTEGER PRIMARY KEY,
-                rant_id INTEGER,
-                user_id INTEGER,
-                body TEXT,
-                score INTEGER,
-                created_time INTEGER
-            );
-            """
-        )
-        await self._conn.commit()
-        logging.info("Table schema verified.")
+    async def _create_indexes(self):
+        def _sync_create():
+            self._db.query("CREATE INDEX IF NOT EXISTS idx_rants_user_id ON rants(user_id)")
+            self._db.query("CREATE INDEX IF NOT EXISTS idx_rants_created_time ON rants(created_time)")
+            self._db.query("CREATE INDEX IF NOT EXISTS idx_comments_rant_id ON comments(rant_id)")
+            self._db.query("CREATE INDEX IF NOT EXISTS idx_comments_user_id ON comments(user_id)")
+            self._db.query("CREATE INDEX IF NOT EXISTS idx_users_username ON users(username)")
+        await asyncio.to_thread(_sync_create)
+        logging.info("Database indexes verified.")
+
+    async def _periodic_flush(self):
+        while True:
+            await asyncio.sleep(self.flush_interval)
+            await self.flush_all()
+
+    async def flush_all(self):
+        async with self._lock:
+            await self._flush_rants()
+            await self._flush_comments()
+            await self._flush_users()
+
+    async def _flush_rants(self):
+        if not self._rant_batch:
+            return
+        batch = self._rant_batch.copy()
+        self._rant_batch.clear()
+
+        def _sync_insert():
+            table = self._db["rants"]
+            for rant in batch:
+                table.upsert(rant, ["id"])
+
+        await asyncio.to_thread(_sync_insert)
+        logging.debug(f"Flushed {len(batch)} rants to database")
+
+    async def _flush_comments(self):
+        if not self._comment_batch:
+            return
+        batch = self._comment_batch.copy()
+        self._comment_batch.clear()
+
+        def _sync_insert():
+            table = self._db["comments"]
+            for comment in batch:
+                table.upsert(comment, ["id"])
+
+        await asyncio.to_thread(_sync_insert)
+        logging.debug(f"Flushed {len(batch)} comments to database")
+
+    async def _flush_users(self):
+        if not self._user_batch:
+            return
+        batch = self._user_batch.copy()
+        self._user_batch.clear()
+
+        def _sync_insert():
+            table = self._db["users"]
+            for user in batch:
+                table.upsert(user, ["id"])
+
+        await asyncio.to_thread(_sync_insert)
+        logging.debug(f"Flushed {len(batch)} users to database")
+
+    def _transform_rant(self, rant: Rant) -> Dict[str, Any]:
+        attached_image = rant.get("attached_image")
+        image_url = None
+        if isinstance(attached_image, dict):
+            image_url = attached_image.get("url")
+        elif isinstance(attached_image, str):
+            image_url = attached_image
+
+        tags = rant.get("tags", [])
+        tags_str = json.dumps(tags) if tags else None
+
+        return {
+            "id": rant["id"],
+            "user_id": rant["user_id"],
+            "text": rant["text"],
+            "score": rant["score"],
+            "created_time": rant["created_time"],
+            "num_comments": rant["num_comments"],
+            "attached_image_url": image_url,
+            "tags": tags_str,
+            "link": rant.get("link"),
+            "vote_state": rant.get("vote_state"),
+            "user_username": rant.get("user_username"),
+            "user_score": rant.get("user_score"),
+        }
+
+    def _transform_comment(self, comment: Comment) -> Dict[str, Any]:
+        return {
+            "id": comment["id"],
+            "rant_id": comment["rant_id"],
+            "user_id": comment["user_id"],
+            "body": comment["body"],
+            "score": comment["score"],
+            "created_time": comment["created_time"],
+            "vote_state": comment.get("vote_state"),
+            "user_username": comment.get("user_username"),
+            "user_score": comment.get("user_score"),
+        }
+
+    def _transform_user(self, user: UserProfile, user_id: int) -> Dict[str, Any]:
+        return {
+            "id": user_id,
+            "username": user["username"],
+            "score": user["score"],
+            "about": user.get("about"),
+            "location": user.get("location"),
+            "created_time": user.get("created_time"),
+            "skills": user.get("skills"),
+            "github": user.get("github"),
+            "website": user.get("website"),
+        }
 
     async def add_rant(self, rant: Rant):
-        await self._conn.execute(
-            "INSERT OR IGNORE INTO rants (id, user_id, text, score, created_time, num_comments) VALUES (?, ?, ?, ?, ?, ?)",
-            (
-                rant["id"],
-                rant["user_id"],
-                rant["text"],
-                rant["score"],
-                rant["created_time"],
-                rant["num_comments"],
-            ),
-        )
-        await self._conn.commit()
+        async with self._lock:
+            self._rant_batch.append(self._transform_rant(rant))
+            if len(self._rant_batch) >= self.batch_size:
+                await self._flush_rants()
 
     async def add_comment(self, comment: Comment):
-        await self._conn.execute(
-            "INSERT OR IGNORE INTO comments (id, rant_id, user_id, body, score, created_time) VALUES (?, ?, ?, ?, ?, ?)",
-            (
-                comment["id"],
-                comment["rant_id"],
-                comment["user_id"],
-                comment["body"],
-                comment["score"],
-                comment["created_time"],
-            ),
-        )
-        await self._conn.commit()
+        async with self._lock:
+            self._comment_batch.append(self._transform_comment(comment))
+            if len(self._comment_batch) >= self.batch_size:
+                await self._flush_comments()
 
     async def add_user(self, user: UserProfile, user_id: int):
-        await self._conn.execute(
-            "INSERT OR IGNORE INTO users (id, username, score, about, location, created_time, skills, github, website) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
-            (
-                user_id,
-                user["username"],
-                user["score"],
-                user["about"],
-                user["location"],
-                user["created_time"],
-                user["skills"],
-                user["github"],
-                user["website"],
-            ),
-        )
-        await self._conn.commit()
+        async with self._lock:
+            self._user_batch.append(self._transform_user(user, user_id))
+            if len(self._user_batch) >= self.batch_size:
+                await self._flush_users()
 
     async def rant_exists(self, rant_id: int) -> bool:
-        async with self._conn.execute(
-            "SELECT 1 FROM rants WHERE id = ? LIMIT 1", (rant_id,)
-        ) as cursor:
-            return await cursor.fetchone() is not None
+        def _sync_check():
+            table = self._db["rants"]
+            return table.find_one(id=rant_id) is not None
+        return await asyncio.to_thread(_sync_check)
 
     async def user_exists(self, user_id: int) -> bool:
-        async with self._conn.execute(
-            "SELECT 1 FROM users WHERE id = ? LIMIT 1", (user_id,)
-        ) as cursor:
-            return await cursor.fetchone() is not None
+        def _sync_check():
+            table = self._db["users"]
+            return table.find_one(id=user_id) is not None
+        return await asyncio.to_thread(_sync_check)
 
     async def get_random_user_ids(self, limit: int) -> List[int]:
-        logging.info(
-            f"Fetching up to {limit} random user IDs from database for seeding..."
-        )
-        query = "SELECT id FROM users ORDER BY RANDOM() LIMIT ?"
-        async with self._conn.execute(query, (limit,)) as cursor:
-            rows = await cursor.fetchall()
-        user_ids = [row[0] for row in rows]
+        logging.info(f"Fetching up to {limit} random user IDs from database for seeding...")
+
+        def _sync_fetch():
+            result = self._db.query(f"SELECT id FROM users ORDER BY RANDOM() LIMIT {limit}")
+            return [row["id"] for row in result]
+
+        user_ids = await asyncio.to_thread(_sync_fetch)
         logging.info(f"Found {len(user_ids)} user IDs to seed.")
         return user_ids
+
+    async def get_all_rant_ids(self) -> List[int]:
+        def _sync_fetch():
+            result = self._db.query("SELECT id FROM rants")
+            return [row["id"] for row in result]
+        return await asyncio.to_thread(_sync_fetch)
+
+    async def get_all_user_ids(self) -> List[int]:
+        def _sync_fetch():
+            result = self._db.query("SELECT id FROM users")
+            return [row["id"] for row in result]
+        return await asyncio.to_thread(_sync_fetch)
+
+    async def save_crawler_state(self, key: str, value: str):
+        def _sync_save():
+            table = self._db["crawler_state"]
+            table.upsert({"key": key, "value": value}, ["key"])
+        await asyncio.to_thread(_sync_save)
+
+    async def load_crawler_state(self, key: str) -> Optional[str]:
+        def _sync_load():
+            table = self._db["crawler_state"]
+            row = table.find_one(key=key)
+            return row["value"] if row else None
+        return await asyncio.to_thread(_sync_load)
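The write path above leans on the dataset library's `upsert`, which inserts a row or updates the one matching the key columns; that is what makes re-crawling the same rant idempotent. A minimal sketch of the same call pattern (the demo database file name is assumed):

```python
import dataset

db = dataset.connect("sqlite:///demo.sqlite")  # hypothetical demo file
rants = db["rants"]  # table is created lazily on first write

rants.upsert({"id": 1, "text": "first crawl", "score": 1}, ["id"])
rants.upsert({"id": 1, "text": "re-crawled, edited", "score": 5}, ["id"])

print(rants.count())                  # 1 -- the second call updated in place
print(rants.find_one(id=1)["text"])   # "re-crawled, edited"
```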
examples/crawler/main.py

@@ -1,30 +1,33 @@
-# main.py
+# retoor <retoor@molodetz.nl>
 import asyncio
 import logging
 import signal
 
 from crawler import DevRantCrawler
 from database import DatabaseManager
 
 from devranta.api import Api
 
-# --- Configuration ---
 DB_FILE = "devrant.sqlite"
-CONCURRENT_RANT_CONSUMERS = 10  # How many rants to process at once
-CONCURRENT_USER_CONSUMERS = 5  # How many user profiles to fetch at once
+CONCURRENT_RANT_CONSUMERS = 10
+CONCURRENT_USER_CONSUMERS = 5
+BATCH_SIZE = 100
+FLUSH_INTERVAL = 5.0
 
 
 async def main():
-    """Initializes and runs the crawler."""
     logging.basicConfig(
         level=logging.INFO,
         format="%(asctime)s [%(levelname)s] - %(message)s",
         datefmt="%Y-%m-%d %H:%M:%S",
     )
 
-    api = Api()
-    async with DatabaseManager(DB_FILE) as db:
-        crawler = DevRantCrawler(
-            api=api,
-            db=db,
+    async with Api() as api:
+        async with DatabaseManager(
+            DB_FILE,
+            batch_size=BATCH_SIZE,
+            flush_interval=FLUSH_INTERVAL,
+        ) as db:
+            crawler = DevRantCrawler(
+                api=api,
+                db=db,
@@ -32,7 +35,6 @@ async def main():
                 user_consumers=CONCURRENT_USER_CONSUMERS,
             )
 
-            # Set up a signal handler for graceful shutdown on Ctrl+C
             loop = asyncio.get_running_loop()
             for sig in (signal.SIGINT, signal.SIGTERM):
                 loop.add_signal_handler(
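main.py's graceful shutdown hinges on `loop.add_signal_handler`, whose callback arguments are truncated in the hunk above. A minimal sketch of the wiring, assuming the handler simply trips an event that the crawler's run loop watches:

```python
import asyncio
import signal


async def main() -> None:
    stop = asyncio.Event()
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGINT, signal.SIGTERM):
        # Assumed callback: flip an event instead of raising KeyboardInterrupt,
        # so running tasks can drain their queues before exiting.
        loop.add_signal_handler(sig, stop.set)

    print("Running; press Ctrl+C to stop.")
    await stop.wait()
    print("Shutting down gracefully.")


asyncio.run(main())
```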
examples/crawler/requirements.txt

@@ -1 +1 @@
-aiosqlite
+dataset
examples/princess/README.md

@@ -1,42 +1,55 @@
-# Princess Bot - Usage and Configuration Guide
+# Princess Bot
+
+Author: retoor <retoor@molodetz.nl>
+
+An automated social media interaction bot for the devRant platform. Monitors a target user's posts and generates LLM-powered responses.
 
 ## Overview
 
-Princess.py is an automated social media interaction bot designed to monitor and respond to specific user-generated content (rants and comments) on a platform. It fetches new posts made by a target user, generates witty or devastating responses using a language model, and keeps track of responded messages to avoid duplicates.
-
-The bot operates continuously, periodically checking for new content and replying accordingly.
-
----
-
-## How It Works
-
-1. **Initialization**: The bot initializes with user credentials, target username, and API keys.
-2. **Login**: It logs into the platform via the provided API.
-3. **Content Monitoring**: It fetches recent rants and comments made by the target user.
-4. **Response Generation**: For new content (not responded to before), it generates a response using a language model (GrokAPIClient).
-5. **Response Posting**: It prints the content and the generated reply.
-6. **Tracking**: It records responded messages in a local database to prevent duplicate responses.
-7. **Loop**: It repeats this process every 60 seconds.
-
----
-
-## Configuration
-
-The script uses a `.env` file to manage sensitive credentials and configurable properties. Below are the supported environment variables:
-
-### Required Environment Variables
-
-| Property | Description | Example |
-|----------|-------------|---------|
-| `USERNAME` | Your platform username. | `my_username` |
-| `PASSWORD` | Your platform password. | `my_password` |
-| `TARGET` | The username of the user to monitor. | `target_user` |
-| `LLM_KEY` | API key for the language model (Grok API). | `your-grok-api-key` |
-
-## Setup Instructions
-
-1. **Create a `.env` file** in the same directory as `princess.py`.
-2. **Add the required variables** with your credentials and target info:
+Princess Bot monitors rants and comments from a specified user on devRant, generates contextual responses using the Grok language model, and posts replies automatically. The bot maintains state to prevent duplicate responses.
+
+## Architecture
+
+The bot operates on a polling model with the following components:
+
+| Component | Description |
+|-----------|-------------|
+| Api | devRant API client for authentication and content retrieval |
+| GrokAPIClient | LLM integration for response generation |
+| AsyncDataSet | Async SQLite wrapper for state persistence |
+
+## Usage
+
+### Quick Start
+
+```bash
+make
+```
+
+This creates a virtual environment, installs dependencies, and starts the bot.
+
+### Manual Setup
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -e ../../.
+pip install -r requirements.txt
+python princess.py
+```
+
+### Configuration
+
+Create a `.env` file with the following variables:
+
+| Variable | Description |
+|----------|-------------|
+| `USERNAME` | devRant account username |
+| `PASSWORD` | devRant account password |
+| `TARGET` | Username of the user to monitor |
+| `LLM_KEY` | API key for Grok language model |
+
+Example:
 
 ```env
 USERNAME=your_username
@@ -45,35 +58,42 @@ TARGET=target_username
 LLM_KEY=your_grok_api_key
 ```
 
-3. **Install dependencies** (if not already installed):
+### Stopping
+
+Press `Ctrl+C` to terminate the bot.
+
+## Data Storage
+
+Uses SQLite via AsyncDataSet with:
+
+- Responded message tracking for deduplication
+- Persistent state across restarts
+
+## Requirements
+
+- Python 3.10+
+- python-dotenv
+- aiosqlite
+- aiohttp (via parent devranta package)
+
+## Cleanup
 
 ```bash
-pip install python-dotenv
+make clean
 ```
 
-4. **Run the script**:
-
-```bash
-python princess.py
-```
-
----
-
-## Notes
-
-- The bot stores responded messages in a local SQLite database (`princess.db`) to avoid duplicate responses.
-- It runs indefinitely, checking for new content every 60 seconds.
-- Make sure your API keys and credentials are kept secure and not shared publicly.
-
----
-
-## Summary
-
-Princess.py is a social media response bot that:
-
-- Monitors a specific user's posts.
-- Generates witty responses using a language model.
-- Keeps track of responses to prevent duplicates.
-- Runs continuously with minimal setup.
+Removes the virtual environment. Database file (`princess.db`) is preserved.
+
+## File Structure
+
+```
+princess/
+├── princess.py       # Main bot implementation
+├── ads.py            # AsyncDataSet database wrapper
+├── grk.py            # Grok API client
+├── requirements.txt  # Dependencies
+├── Makefile          # Build automation
+├── .env              # Configuration (create manually)
+├── .venv/            # Virtual environment (created on first run)
+└── princess.db       # SQLite database (created on first run)
+```
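The polling model in this README reduces to a small loop: fetch the target's recent posts, skip anything already answered, generate a reply, record it. The sketch below shows only that shape; the fetch and reply helpers are placeholders, not the real `Api`/`GrokAPIClient`/`AsyncDataSet` interfaces, and the 60-second interval comes from the old README text replaced above:

```python
import asyncio

responded: set[str] = set()  # stand-in for the SQLite-backed state table


async def fetch_target_posts() -> list[dict]:
    # Placeholder for fetching the target user's recent rants and comments.
    return [{"id": "rant-1", "text": "example rant"}]


async def poll_once() -> None:
    for post in await fetch_target_posts():
        if post["id"] in responded:
            continue  # deduplication: never answer the same post twice
        reply = f"witty reply to: {post['text']}"  # placeholder for the LLM call
        print(reply)  # the real bot posts this back through the devRant API
        responded.add(post["id"])


async def main() -> None:
    while True:
        await poll_once()
        await asyncio.sleep(60)


asyncio.run(main())
```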
src/devranta/api.py

@@ -1,5 +1,7 @@
+# retoor <retoor@molodetz.nl>
 from __future__ import annotations
 
+import ssl
 from enum import Enum
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
 
@@ -119,7 +121,40 @@ class Api:
         self.user_id: Optional[int] = None
         self.token_id: Optional[int] = None
         self.token_key: Optional[str] = None
-        self.session: Optional[aiohttp.ClientSession] = None
+        self._session: Optional[aiohttp.ClientSession] = None
+        self._owns_session: bool = False
+
+    async def __aenter__(self):
+        """Async context manager entry - creates shared HTTP session."""
+        ssl_context = ssl.create_default_context()
+        ssl_context.check_hostname = False
+        ssl_context.verify_mode = ssl.CERT_NONE
+        connector = aiohttp.TCPConnector(ssl=ssl_context)
+        self._session = aiohttp.ClientSession(connector=connector)
+        self._owns_session = True
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit - closes shared HTTP session."""
+        await self.close()
+
+    async def _get_session(self) -> aiohttp.ClientSession:
+        """Returns or creates a shared HTTP session for connection reuse."""
+        if self._session is None:
+            ssl_context = ssl.create_default_context()
+            ssl_context.check_hostname = False
+            ssl_context.verify_mode = ssl.CERT_NONE
+            connector = aiohttp.TCPConnector(ssl=ssl_context)
+            self._session = aiohttp.ClientSession(connector=connector)
+            self._owns_session = True
+        return self._session
+
+    async def close(self):
+        """Closes the HTTP session if owned by this instance."""
+        if self._session and self._owns_session:
+            await self._session.close()
+            self._session = None
+            self._owns_session = False
 
     def patch_auth(
         self, request_dict: Optional[Dict[str, Any]] = None
@@ -177,7 +212,7 @@ class Api:
         """
         if not self.username or not self.password:
             raise Exception("No authentication details supplied.")
-        async with aiohttp.ClientSession() as session:
+        session = await self._get_session()
         response = await session.post(
             url=self.patch_url("users/auth-token"),
             data={
@@ -224,9 +259,9 @@ class Api:
         }
         ```
         """
-        async with aiohttp.ClientSession() as session:
+        session = await self._get_session()
         response = await session.post(
-            url=self.patch_url(f"users"),
+            url=self.patch_url("users"),
             data=self.patch_auth(
                 {
                     "email": email,
@@ -270,7 +305,7 @@ class Api:
         """
         if not await self.ensure_login():
             return False
-        async with aiohttp.ClientSession() as session:
+        session = await self._get_session()
         response = await session.post(
             url=self.patch_url(f"devrant/rants/{rant_id}/comments"),
             data=self.patch_auth({"comment": comment, "plat": 2}),
@@ -288,7 +323,7 @@ class Api:
         Returns:
             Optional[Comment]: A dictionary representing the comment, or None if not found.
         """
-        async with aiohttp.ClientSession() as session:
+        session = await self._get_session()
         response = await session.get(
             url=self.patch_url(f"comments/{id_}"), params=self.patch_auth()
         )
@@ -307,7 +342,7 @@ class Api:
         """
         if not await self.ensure_login():
             return False
-        async with aiohttp.ClientSession() as session:
+        session = await self._get_session()
         response = await session.delete(
             url=self.patch_url(f"comments/{id_}"), params=self.patch_auth()
         )
@@ -324,7 +359,7 @@ class Api:
         Returns:
             Optional[UserProfile]: A dictionary with the user's profile data.
         """
-        async with aiohttp.ClientSession() as session:
+        session = await self._get_session()
         response = await session.get(
             url=self.patch_url(f"users/{id_}"), params=self.patch_auth()
         )
@@ -341,7 +376,7 @@ class Api:
         Returns:
             List[Rant]: A list of rant objects from the search results.
         """
-        async with aiohttp.ClientSession() as session:
+        session = await self._get_session()
         response = await session.get(
             url=self.patch_url("devrant/search"),
             params=self.patch_auth({"term": term}),
@@ -359,7 +394,7 @@ class Api:
         Returns:
             Dict[str, Any]: The full API response object.
         """
-        async with aiohttp.ClientSession() as session:
+        session = await self._get_session()
         response = await session.get(
             self.patch_url(f"devrant/rants/{id}"),
             params=self.patch_auth(),
@@ -380,7 +415,7 @@ class Api:
         Returns:
             List[Rant]: A list of rant objects.
         """
-        async with aiohttp.ClientSession() as session:
+        session = await self._get_session()
         response = await session.get(
             url=self.patch_url("devrant/rants"),
             params=self.patch_auth({"sort": sort, "limit": limit, "skip": skip}),
@@ -398,7 +433,7 @@ class Api:
         Returns:
             Optional[int]: The user's ID, or None if not found.
         """
-        async with aiohttp.ClientSession() as session:
+        session = await self._get_session()
         response = await session.get(
             url=self.patch_url("get-user-id"),
             params=self.patch_auth({"username": username}),
@@ -431,7 +466,7 @@ class Api:
         """
         if not await self.ensure_login():
             return False
-        async with aiohttp.ClientSession() as session:
+        session = await self._get_session()
         response = await session.post(
             url=self.patch_url(f"comments/{comment_id}"),
             data=self.patch_auth({"comment": comment}),
@@ -455,7 +490,7 @@ class Api:
         """
         if not await self.ensure_login():
             return False
-        async with aiohttp.ClientSession() as session:
+        session = await self._get_session()
         response = await session.post(
             url=self.patch_url(f"devrant/rants/{rant_id}/vote"),
             data=self.patch_auth(
@@ -484,7 +519,7 @@ class Api:
         """
         if not await self.ensure_login():
             return False
-        async with aiohttp.ClientSession() as session:
+        session = await self._get_session()
         response = await session.post(
             url=self.patch_url(f"comments/{comment_id}/vote"),
             data=self.patch_auth(
@@ -503,7 +538,7 @@ class Api:
         """
         if not await self.ensure_login():
             return []
-        async with aiohttp.ClientSession() as session:
+        session = await self._get_session()
         response = await session.get(
             url=self.patch_url("users/me/notif-feed"), params=self.patch_auth()
         )
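The net effect of the api.py changes: one shared `aiohttp` session (with certificate verification disabled, per the SSL note in the crawler README) instead of a fresh `ClientSession` per request. Callers opt in through the new async context manager; a minimal usage sketch:

```python
import asyncio

from devranta.api import Api


async def main() -> None:
    async with Api() as api:  # __aenter__ builds the shared session
        rants = await api.get_rants(sort="recent", limit=10, skip=0)
        for rant in rants:
            print(rant["id"], rant["text"][:60])
    # __aexit__ -> close(): the session is closed because this instance owns it


asyncio.run(main())
```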