diff --git a/.gitignore b/.gitignore index 863dde4..57213f9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ +.history dist venv export -src/drstats/__pycache__ \ No newline at end of file +src/drstats/__pycache__ diff --git a/Makefile b/Makefile index 57563d9..76adc47 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,16 @@ -all: build +all: build sync export_stats export_dataset build: - pip install build - python -m build . - pip install -e . \ No newline at end of file + time pip install build + time python -m build . + time pip install -e . + +sync: + @echo "Synchronizing with devrant.com." + time dr.sync +export_stats: + @echo "Exporting statisticts." + time dr.stats_all +export_dataset: + @echo "Exporting dataset to be used for LLM embedding." + time dr.dataset > export/dataset.txt diff --git a/README.md b/README.md index 5a7f94c..03e9762 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,32 @@ ## About Simple project to determine health of the devrant platform. +Also, it will generate a dataset to be used with machine learning. +Make Retoor9b great again! ## Credits Thanks to Rohan Burke (coolq). The creator of the dr api wrapper this project uses. Since it isn't made like a package, i had to copy his source files to my source folder. His library: https://github.com/coolq1000/devrant-python-api/ +## Using this project + +### Prepare environment +Create python3 environment: +``` +python3 -m venv ./venv +``` +Activate python3 environment: +``` +source ./venv/bin/activate +``` +### Make +You don't have to use more than make. If you just run `make` all statistics will be generated. It will execute the right apps for generating statistics. +### Applications +If you type `dr.` in terminal and press tab you'll see all available apps auto completed. These applications are also used by make. +``` +1. `dr.sync` synchronizes all data from last two weeks from devrant. Only two weeks because it's rate limited. +2. 
`dr.dataset` exports all data to be used for LLM embedding; don't forget to execute `dr.sync` first. +3. `dr.rant_stats_all` exports all graphs to export folder; don't forget to execute `dr.sync` first. +4. `dr.rant_stats_per_day` exports graphs to export folder; don't forget to execute `dr.sync` first. +5. `dr.rant_stats_per_hour` exports graphs to export folder; don't forget to execute `dr.sync` first. +6. `dr.rant_stats_per_weekday` exports graphs to export folder; don't forget to execute `dr.sync` first. diff --git a/drstats.db b/drstats.db index c065a32..b5c0756 100644 Binary files a/drstats.db and b/drstats.db differ diff --git a/setup.cfg b/setup.cfg index ed38fbb..b3c786b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,4 +27,5 @@ console_scripts = dr.rant_stats_per_day = drstats.statistics:rant_stats_per_day dr.rant_stats_per_weekday = drstats.statistics:rant_stats_per_weekday dr.rant_stats_per_hour = drstats.statistics:rant_stats_per_hour - dr.rant_stats_all = drstats.statistics:rant_stats_all \ No newline at end of file + dr.rant_stats_all = drstats.statistics:rant_stats_all + dr.dataset = drstats.dataset:dump \ No newline at end of file diff --git a/src/drstats.egg-info/PKG-INFO b/src/drstats.egg-info/PKG-INFO index 3a2fc1e..f5b3833 100644 --- a/src/drstats.egg-info/PKG-INFO +++ b/src/drstats.egg-info/PKG-INFO @@ -16,8 +16,32 @@ Requires-Dist: matplotlib>=3.9.2 ## About Simple project to determine health of the devrant platform. +Also, it will generate a dataset to be used with machine learning. +Make Retoor9b great again! ## Credits Thanks to Rohan Burke (coolq). The creator of the dr api wrapper this project uses. Since it isn't made like a package, i had to copy his source files to my source folder. 
His library: https://github.com/coolq1000/devrant-python-api/ +## Using this project + +### Prepare environment +Create python3 environment: +``` +python3 -m venv ./venv +``` +Activate python3 environment: +``` +source ./venv/bin/activate +``` +### Make +You don't have to use more than make. If you just run `make` all statistics will be generated. It will execute the right apps for generating statistics. +### Applications +If you type `dr.` in terminal and press tab you'll see all available apps auto completed. These applications are also used by make. +``` +1. `dr.sync` synchronizes all data from last two weeks from devrant. Only two weeks because it's rate limited. +2. `dr.dataset` exports all data to be used for LLM embedding; don't forget to execute `dr.sync` first. +3. `dr.rant_stats_all` exports all graphs to export folder; don't forget to execute `dr.sync` first. +4. `dr.rant_stats_per_day` exports graphs to export folder; don't forget to execute `dr.sync` first. +5. `dr.rant_stats_per_hour` exports graphs to export folder; don't forget to execute `dr.sync` first. +6. `dr.rant_stats_per_weekday` exports graphs to export folder; don't forget to execute `dr.sync` first. 
diff --git a/src/drstats.egg-info/SOURCES.txt b/src/drstats.egg-info/SOURCES.txt index 5c61138..b68d0ba 100644 --- a/src/drstats.egg-info/SOURCES.txt +++ b/src/drstats.egg-info/SOURCES.txt @@ -3,8 +3,11 @@ pyproject.toml setup.cfg src/drstats/__init__.py src/drstats/__main__.py +src/drstats/dataset.py src/drstats/db.py src/drstats/devrant.py +src/drstats/dump_text.py +src/drstats/duration.py src/drstats/statistics.py src/drstats/sync.py src/drstats.egg-info/PKG-INFO diff --git a/src/drstats.egg-info/entry_points.txt b/src/drstats.egg-info/entry_points.txt index 31b927c..a7e6769 100644 --- a/src/drstats.egg-info/entry_points.txt +++ b/src/drstats.egg-info/entry_points.txt @@ -1,4 +1,5 @@ [console_scripts] +dr.dataset = drstats.dataset:dump dr.rant_stats_all = drstats.statistics:rant_stats_all dr.rant_stats_per_day = drstats.statistics:rant_stats_per_day dr.rant_stats_per_hour = drstats.statistics:rant_stats_per_hour diff --git a/src/drstats/__pycache__/db.cpython-312.pyc b/src/drstats/__pycache__/db.cpython-312.pyc index 5e7c225..db03e4b 100644 Binary files a/src/drstats/__pycache__/db.cpython-312.pyc and b/src/drstats/__pycache__/db.cpython-312.pyc differ diff --git a/src/drstats/__pycache__/devrant.cpython-312.pyc b/src/drstats/__pycache__/devrant.cpython-312.pyc index 614fb52..9ab6187 100644 Binary files a/src/drstats/__pycache__/devrant.cpython-312.pyc and b/src/drstats/__pycache__/devrant.cpython-312.pyc differ diff --git a/src/drstats/__pycache__/statistics.cpython-312.pyc b/src/drstats/__pycache__/statistics.cpython-312.pyc index 49d342e..4a78407 100644 Binary files a/src/drstats/__pycache__/statistics.cpython-312.pyc and b/src/drstats/__pycache__/statistics.cpython-312.pyc differ diff --git a/src/drstats/__pycache__/sync.cpython-312.pyc b/src/drstats/__pycache__/sync.cpython-312.pyc index 185c02b..0de808a 100644 Binary files a/src/drstats/__pycache__/sync.cpython-312.pyc and b/src/drstats/__pycache__/sync.cpython-312.pyc differ diff --git a/src/drstats/db.py 
b/src/drstats/db.py index 1563cfb..765550f 100644 --- a/src/drstats/db.py +++ b/src/drstats/db.py @@ -173,8 +173,21 @@ FROM comments GROUP BY username, DATE(comments.created) ORDER BY username ASC, date ASC; """ - ) - + ) + db.query("DROP VIEW IF EXISTS contributions") + db.query("""CREATE VIEW contributions AS select distinct user_username as username, count(0) as contributions,sum(score) as upvotes,avg(length(text)) as post_length_average, sum(length(text)) as content_length from rants + union + select distinct user_username as username, count(0) as contributions,sum(score) as upvotes, sum(length(body)) / count(0) as post_length_average, sum(length(body)) as content_length from comments + group by username + order by contributions desc, username asc + """); + db.query("DROP VIEW IF EXISTS contributions_extended") + db.query("CREATE VIEW contributions_extended as SELECT username, contributions,ROUND(CAST(contributions AS REAL) / CAST((select contributions from contributions) AS REAL),2) as ownership, upvotes, ROUND(CAST(upvotes AS REAL) / CAST((SELECT SUM(upvotes) from contributions) AS REAL),2) upvotes_ownership, ROUND(CAST(upvotes AS REAL) / CAST(contributions AS REAL),2) upvote_ratio,content_length as post_length_total, ROUND(CAST(content_length AS REAL) / CAST((SELECT SUM(content_length) from contributions) AS REAL)) as ownership_content,post_length_average FROM contributions") + db.query("DROP VIEW IF EXISTS rants_of_user") + db.query("CREATE VIEW rants_of_user as SELECT user_username as username, GROUP_CONCAT(text) as text FROM rants") + db.query("DROP VIEW IF EXISTS posts_of_user") + db.query("CREATE VIEW posts_of_user AS SELECT user_username as username, GROUP_CONCAT(body) as text FROM comments") + return db @@ -183,6 +196,20 @@ class Db: def __init__(self): self.db = None + def __enter__(self): + self.db = get_db() + return self + + def query(self, str): + with Duration("DB Query {}".format(str[:80])): + return self.db.query(str) + + + def __exit__(self, 
exc_type, exc_val, exc_tb): + self.db.close() + self.db = None + + async def __aenter__(self): self.db = get_db() return self @@ -195,3 +222,38 @@ class Db: async def __aexit__(self, exc_type, exc_val, exc_tb): self.db.close() self.db = None + + +def get_contributions(): + with Db() as db: + contributions = db.query("SELECT ROW_NUMBER() OVER (ORDER BY upvote_ratio DESC) as popularity_postion, * FROM contributions_extended ORDER BY upvote_ratio DESC") + return list(contributions) + +def get_upvote_average(): + return avg(contribution['upvote_ratio'] for contribution in get_contributions()) + +def get_users(): + return list(set([user['username'] for user in get_contributions()])) + +def get_user_count(): + return len(get_users()) + +def get_contribution_count(): + return sum(user['contributions'] for user in get_contributions()) + +def get_contribution_average_per_user(): + return round(get_contribution_count() / get_user_count(),2) + +def get_all_rants_of_user(username): + + with Db() as db: + try: + return db.db['rants_of_user'].find_one(username=username)['text'] + except TypeError: + return "" +def get_all_posts_of_user(username): + with Db() as db: + try: + return db.db['posts_of_user'].find_one(username=username)['text'] + except TypeError: + return "" \ No newline at end of file diff --git a/src/drstats/devrant.py b/src/drstats/devrant.py index ea8d1bb..b05655a 100644 --- a/src/drstats/devrant.py +++ b/src/drstats/devrant.py @@ -33,7 +33,7 @@ class Devrant: url = self.API + "devrant/search" params = {"app": 3, "term": term} - r = requests.get(url, params) + r = requests.get(url, params,timeout=5) obj = json.loads(r.text) return obj @@ -52,7 +52,7 @@ class Devrant: params = { "app": 3, } - r = requests.get(url, params) + r = requests.get(url, params,timeout=5) obj = json.loads(r.text) return obj @@ -65,7 +65,7 @@ class Devrant: url = self.API + "devrant/rants" params = {"app": 3, "sort": sort, "limit": limit, "skip": skip} - r = requests.get(url, params) + r = 
requests.get(url, params,timeout=5) obj = json.loads(r.text) return obj @@ -80,19 +80,4 @@ class Devrant: r = requests.get(url, params) obj = json.loads(r.text) - return obj - - -if __name__ == "__main__": - # Simple demo, runs through rants sorted by most recent. - dr = Devrant() - i = 0 - while True: - result = dr.get_rant("recent", i) - print("\n" * 50) - name = result["user_username"] - tags = ", ".join(result["tags"]) - print("-" + name + "-" * (50 - (len(name) + 1))) - print(result["text"]) - print("-" + tags + "-" * (50 - (len(tags) + 1))) - i += 1 + return obj \ No newline at end of file diff --git a/src/drstats/statistics.py b/src/drstats/statistics.py index 465c929..52ca78c 100644 --- a/src/drstats/statistics.py +++ b/src/drstats/statistics.py @@ -1,4 +1,4 @@ -from drstats.db import get_db, Db +from drstats.db import get_db, Db,get_users from drstats import sync import asyncio from drstats.duration import Duration @@ -282,25 +282,9 @@ def rant_stats_all(): asyncio.run(comment_stats_per_hour()) asyncio.run(score_most_ignored_last_7_days()) asyncio.run(score_last_7_days()) - asyncio.run(user_score_per_day("retoor")) - asyncio.run(user_score_per_day("Ranchonyx")) - asyncio.run(user_score_per_day("atheist")) - asyncio.run(user_score_per_day("Chewbanacas")) - asyncio.run(user_score_per_day("ScriptCoded")) - asyncio.run(user_score_per_day("bazmd")) - asyncio.run(user_score_per_day("feuerherz")) - asyncio.run(user_score_per_day("D-4got10-01")) - asyncio.run(user_score_per_day("jestdotty")) - asyncio.run(user_score_per_day("Demolishun")) - asyncio.run(user_score_per_day("cafecortado")) - asyncio.run(user_score_per_day("lungdart")) - asyncio.run(user_score_per_day("kiki")) - asyncio.run(user_score_per_day("netikras")) - asyncio.run(user_score_per_day("lorentz")) - asyncio.run(user_score_per_day("12bitfloat")) - asyncio.run(user_score_per_day("root")) - asyncio.run(user_score_per_day("antigermgerm")) - asyncio.run(user_score_per_day("Liebranca")) + for user in 
get_users(): + asyncio.run(user_score_per_day(user)) + diff --git a/src/drstats/sync.py b/src/drstats/sync.py index 3ad9691..73bf4e4 100644 --- a/src/drstats/sync.py +++ b/src/drstats/sync.py @@ -3,7 +3,7 @@ from drstats.db import get_db import json import asyncio from pprint import pprint as pp - +import requests dr = Devrant() db = get_db() @@ -26,18 +26,21 @@ def timestamp_to_string(timestamp): async def get_recent_rants(start_from=1, page_size=10): page = 0 while True: - rants = dr.get_rants("recent", page_size, start_from)["rants"] - page += 1 - for rant in rants: - if rant is None: - break - rant["tags"] = json.dumps("tags" in rant and rant["tags"] or "") - rant["created"] = timestamp_to_string(rant["created_time"]) - rant = plain_object(rant) - - yield rant - start_from += page_size + try: + rants = dr.get_rants("recent", page_size, start_from)["rants"] + page += 1 + for rant in rants: + if rant is None: + break + rant["tags"] = json.dumps("tags" in rant and rant["tags"] or "") + rant["created"] = timestamp_to_string(rant["created_time"]) + rant = plain_object(rant) + yield rant + start_from += page_size + except requests.exceptions.ConnectionError: + print("Rate limit of server exceeded.") + return async def sync_rants(): count = 0