feat: add devrant mention extractor class

feat: implement json and rss export functionality
feat: add command line interface for running the extractor
maintenance: update dependencies and add type hints
docs: add documentation for the extractor class and methods
This commit is contained in:
retoor 2025-11-16 18:38:54 +01:00
parent 59ada88301
commit a08a567b0f
2 changed files with 209 additions and 0 deletions

View File

@ -8,6 +8,14 @@
## Version 0.8.0 - 2025-11-05
Users can now connect external tools to automate more complex tasks. Developers can integrate new tools using the updated elon.py file.
**Changes:** 2 files, 8 lines
**Languages:** Markdown (8 lines)
## Version 0.7.0 - 2025-11-05
The system can now use external tools to complete tasks. This allows it to handle more complex requests and provide more comprehensive responses.

201
dr.mentions.py Normal file
View File

@ -0,0 +1,201 @@
import json
import re
from datetime import datetime, timezone
from json.decoder import JSONDecodeError
from time import sleep
from typing import Any, Dict, List, Optional, Set
from urllib.error import URLError
from urllib.request import Request, urlopen
from xml.dom import minidom
from xml.etree.ElementTree import Element, SubElement, tostring
class DevRantMentionExtractor:
    """Scrape devRant rants and comments and collect @mention records.

    Mention records accumulate in ``self.mentions`` as dicts with keys
    ``from``, ``to``, ``content``, ``rant_id``, ``comment_id`` and
    ``created_time``.  Duplicate (comment, mentioned-user) pairs are
    suppressed via ``self.seen_mention_ids``.
    """

    # Compiled once at class level; the original rebuilt this pattern on
    # every extract_mentions_from_text() call.
    _MENTION_RE = re.compile(r"@([a-zA-Z0-9_-]+)")

    def __init__(
        self, base_url: str = "https://www.devrant.io/api/devrant", app_id: str = "3"
    ) -> None:
        """Create an extractor for the given API base URL and app id."""
        self.base_url: str = base_url
        self.app_id: str = app_id
        # Collected mention records (sorted newest-first by run()).
        self.mentions: List[Dict[str, Any]] = []
        # Dedup guard across pages and repeated runs of extract_all_mentions.
        self.seen_mention_ids: Set[str] = set()

    def fetch_json(self, url: str) -> Any:
        """GET *url* and return the decoded JSON body.

        Retries forever with a 1-second pause on network or decode errors.
        NOTE(review): this also retries permanent failures (e.g. HTTP 404)
        indefinitely; callers rely on the blocking behavior.
        """
        while True:
            try:
                req: Request = Request(url, headers={"User-Agent": "Mozilla/5.0"})
                with urlopen(req) as response:
                    return json.loads(response.read().decode("utf-8"))
            except (URLError, JSONDecodeError) as e:
                print(
                    f"[{datetime.now()}] Error fetching/decoding {url}: {e}. "
                    "Retrying in 1 second..."
                )
                sleep(1)
            except Exception as e:
                # Last-resort guard so a long-running crawl never dies here.
                print(
                    f"[{datetime.now()}] Unexpected error in fetch_json "
                    f"({url}): {e}. Retrying in 1 second..."
                )
                sleep(1)

    def get_rants(self, limit: int = 50, skip: int = 0) -> List[Dict[str, Any]]:
        """Return one page of rants from the public feed ([] on API failure)."""
        url: str = (
            f"{self.base_url}/rants?app={self.app_id}&limit={limit}&skip={skip}"
        )
        data: Any = self.fetch_json(url)
        if data.get("success"):
            return data.get("rants", [])
        return []

    def get_rant_details(self, rant_id: int) -> Optional[Dict[str, Any]]:
        """Return the full payload (rant + comments) for *rant_id*, or None."""
        url: str = f"{self.base_url}/rants/{rant_id}?app={self.app_id}"
        data: Any = self.fetch_json(url)
        if data.get("success"):
            return data
        return None

    def extract_mentions_from_text(self, text: str) -> List[str]:
        """Return all @mentioned usernames found in *text*, in order."""
        return self._MENTION_RE.findall(text)

    def process_rant(self, rant_id: int) -> None:
        """Scan a rant's comments and record every previously unseen mention."""
        details: Optional[Dict[str, Any]] = self.get_rant_details(rant_id)
        if not details:
            print(f"Failed to get details for rant {rant_id}")
            return
        comments: List[Dict[str, Any]] = details.get("comments", [])
        for comment in comments:
            comment_body: str = comment.get("body", "")
            mentioned_users: List[str] = self.extract_mentions_from_text(comment_body)
            if not mentioned_users:
                continue
            from_user: str = comment.get("user_username", "unknown")
            created_time: int = comment.get("created_time", 0)
            # NOTE(review): a comment without an "id" yields comment_id=None
            # and a "None-to-user" guid; assumed the API always supplies it.
            comment_id: int = comment.get("id")
            for mentioned_user in mentioned_users:
                # One record per (comment, mentioned user) pair.
                mention_guid: str = f"{comment_id}-to-{mentioned_user}"
                if mention_guid in self.seen_mention_ids:
                    continue
                self.mentions.append(
                    {
                        "from": from_user,
                        "to": mentioned_user,
                        "content": comment_body,
                        "rant_id": rant_id,
                        "comment_id": comment_id,
                        "created_time": created_time,
                    }
                )
                self.seen_mention_ids.add(mention_guid)

    def extract_all_mentions(
        self, num_pages: int = 5, limit: int = 50, delay: float = 0.5
    ) -> List[Dict[str, Any]]:
        """Crawl up to *num_pages* pages of rants, collecting mentions.

        Sleeps *delay* seconds between rants to be polite to the API.
        Stops early when a page comes back empty.  Returns self.mentions.
        """
        for page in range(num_pages):
            skip: int = page * limit
            print(f"Fetching page {page + 1}/{num_pages} (skip={skip})...")
            rants: List[Dict[str, Any]] = self.get_rants(limit=limit, skip=skip)
            if not rants:
                print("No more rants found.")
                break
            for rant in rants:
                rant_id: int = rant.get("id")
                print(f"Processing rant {rant_id}...")
                self.process_rant(rant_id)
                sleep(delay)
        return self.mentions

    def generate_rss(self, output_file: str = "dr.mentions.xml") -> None:
        """Write the collected mentions as a pretty-printed RSS 2.0 feed."""
        rss: Element = Element("rss", version="2.0")
        channel: Element = SubElement(rss, "channel")
        SubElement(channel, "title").text = "devRant Mentions Feed"
        SubElement(channel, "link").text = "https://devrant.com"
        SubElement(channel, "description").text = (
            "Live feed of all @mentions on devRant"
        )
        # datetime.utcnow() is deprecated (3.12+); timezone-aware equivalent
        # renders the same "GMT"-suffixed string.
        SubElement(channel, "lastBuildDate").text = datetime.now(
            timezone.utc
        ).strftime("%a, %d %b %Y %H:%M:%S GMT")
        for mention in self.mentions:
            item: Element = SubElement(channel, "item")
            title: str = f"{mention['from']} mentioned @{mention['to']}"
            SubElement(item, "title").text = title
            link: str = f"https://devrant.com/rants/{mention['rant_id']}"
            SubElement(item, "link").text = link
            description: str = mention["content"]
            SubElement(item, "description").text = description
            guid: str = f"devrant-mention-{mention['comment_id']}-to-{mention['to']}"
            SubElement(item, "guid", isPermaLink="false").text = guid
            if mention.get("created_time"):
                # utcfromtimestamp() is deprecated; aware conversion is
                # byte-identical under this hard-coded GMT format.
                pub_date: str = datetime.fromtimestamp(
                    mention["created_time"], tz=timezone.utc
                ).strftime("%a, %d %b %Y %H:%M:%S GMT")
                SubElement(item, "pubDate").text = pub_date
        xml_string: str = minidom.parseString(tostring(rss)).toprettyxml(indent="  ")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(xml_string)
        print(f"RSS feed saved to {output_file}")

    def save_to_json(self, output_file: str = "mentions.json") -> None:
        """Dump the collected mentions to *output_file* as indented JSON."""
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(self.mentions, f, indent=2, ensure_ascii=False)
        print(f"JSON data saved to {output_file}")

    def run(
        self,
        num_pages: int = 5,
        json_file: str = "dr.mentions.json",
        rss_file: str = "dr.mentions.xml",
    ) -> List[Dict[str, Any]]:
        """Full pipeline: reset state, crawl, sort, export JSON and RSS."""
        print(f"[{datetime.now()}] Starting extraction...")
        # Fresh state so repeated runs re-emit the complete current feed.
        self.mentions = []
        self.seen_mention_ids.clear()
        self.extract_all_mentions(num_pages=num_pages)
        print(f"[{datetime.now()}] Found {len(self.mentions)} mentions total.")
        print(f"[{datetime.now()}] Sorting mentions...")
        self.mentions.sort(key=lambda m: m.get("created_time", 0), reverse=True)
        self.save_to_json(json_file)
        self.generate_rss(rss_file)
        print(f"[{datetime.now()}] Extraction complete.")
        return self.mentions
import time
from datetime import datetime
if __name__ == "__main__":
    # Poll loop: run one full extraction every five minutes until the
    # process is interrupted.  Any unexpected error is logged and retried
    # after the same delay so the feed keeps regenerating unattended.
    while True:
        try:
            worker: DevRantMentionExtractor = DevRantMentionExtractor()
            started: float = time.time()
            worker.run(num_pages=5)
            elapsed: float = time.time() - started
            print(f"[{datetime.now()}] Process took {elapsed:.2f} seconds")
            print(f"[{datetime.now()}] Sleeping for 5 minutes...")
            sleep(300)
        except KeyboardInterrupt:
            print("\nStopping...")
            break
        except Exception as e:
            print(f"[{datetime.now()}] An error occurred: {e}")
            print("Retrying in 5 minutes...")
            sleep(300)