refactor: improve tokenizer and database classes

This commit is contained in:
retoor 2025-11-05 14:12:23 +01:00
parent 74d8372386
commit aa167aee07
2 changed files with 109 additions and 88 deletions

View File

@ -5,6 +5,14 @@
## Version 0.5.0 - 2025-11-05
Users can now run the application inside a chroot container. Developers can use the new `chroot.py` script to initialize and enter these containers.
**Changes:** 2 files, 118 lines
**Languages:** Markdown (8 lines), Python (110 lines)
## Version 0.4.0 - 2025-11-05
Testing has been added to verify the HTTP functionality. This improves the reliability and quality of the HTTP features.

View File

@ -36,138 +36,151 @@ import sqlite3
import pathlib import pathlib
# Command-line options; the parsed result is kept in the module-level ``args``.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--find",
    type=str,
    required=False,
    default="",
    help="print the stored count for this word",
)
parser.add_argument(
    "--index",
    action="store_true",
    help="reset the database and index the files in logs_plain",
)
parser.add_argument(
    "--popular",
    action="store_true",
    help="print the 300 most popular words with their counts",
)
args = parser.parse_args()
def is_valid_character(character):
    """Return True if *character* may be part of a word.

    A word character is one for which ``str.isalnum()`` is true, or an
    underscore. Everything else (including the empty string) is rejected.
    """
    return character.isalnum() or character == "_"
def process_file(file_path, is_word_character=None):
    """Count the words in a text file.

    The file is scanned one character at a time. A word is a maximal run of
    characters accepted by *is_word_character*; a ".", a newline, or any
    rejected character ends the current word. Every completed word is
    printed on its own line as::

        word start end length line paragraph current_paragraph

    where ``start``/``end`` are 1-based inclusive character offsets,
    ``length`` is the number of characters in the word, and ``line`` and
    ``paragraph`` are the counters' values when the word started.

    Args:
        file_path: Path of the file to read (opened in text mode).
        is_word_character: Optional one-argument predicate deciding whether
            a character belongs to a word. When omitted, the module-level
            ``is_valid_character`` is used.

    Returns:
        A dict mapping each word to the number of times it occurs.
    """
    if is_word_character is None:
        # Resolved at call time so the default follows the module's helper.
        is_word_character = is_valid_character

    word_counts = {}
    current_word = []
    word_start_position = -1  # -1 means "not currently inside a word"
    word_line_number = 0
    word_paragraph_number = 0
    # Newlines seen since the last paragraph break. It is not reset by
    # intervening text, so the two newlines need not be adjacent.
    newline_count = 0
    position = 0  # 1-based offset of the character being examined
    line_number = 1  # advanced by every "." encountered
    paragraph_number = 1

    def finish_word(end_position):
        # Report the word collected so far and add it to the tally.
        word = "".join(current_word)
        print(
            f"{word} {word_start_position} {end_position} {len(current_word)} "
            f"{word_line_number} {word_paragraph_number} {paragraph_number}"
        )
        word_counts[word] = word_counts.get(word, 0) + 1

    with open(file_path, "r") as file:
        for line in file:
            for character in line:
                position += 1
                if character == ".":
                    line_number += 1
                elif character == "\n":
                    newline_count += 1

                is_delimiter = (
                    character == "."
                    or character == "\n"
                    or not is_word_character(character)
                )
                if is_delimiter:
                    if word_start_position > -1:
                        # The word ended on the previous character.
                        finish_word(position - 1)
                        word_start_position = -1
                        current_word = []
                    continue

                # The paragraph counter only advances once text resumes.
                if newline_count >= 2:
                    newline_count = 0
                    paragraph_number += 1

                current_word.append(character)
                if word_start_position == -1:
                    word_start_position = position
                    word_line_number = line_number
                    word_paragraph_number = paragraph_number

    # A word that runs up to end-of-file has no trailing delimiter to
    # trigger the flush above, so emit it here.
    if word_start_position > -1:
        finish_word(position)

    return word_counts
class WordDatabase:
    """SQLite-backed store of word occurrence counts.

    Counts live in a ``words`` table with the columns ``word`` and
    ``count``. The table is created by :meth:`reset`; the other methods
    expect it to exist already.
    """

    def __init__(self, path):
        """Open the SQLite database at *path*, creating the file if needed."""
        self.path = path
        self.connection = sqlite3.connect(path)
        self.cursor = self.connection.cursor()

    def reset(self):
        """Drop any existing ``words`` table and create an empty one."""
        self.cursor.execute("DROP TABLE IF EXISTS words")
        # PRIMARY KEY gives ``word`` an index, so the lookups in insert()
        # and get() do not have to scan the whole table.
        self.cursor.execute(
            """
            CREATE TABLE words (
                word TEXT NOT NULL PRIMARY KEY,
                count INTEGER NOT NULL
            )
            """
        )
        self.connection.commit()

    def insert(self, word, count):
        """Add *count* to the total stored for *word*.

        A row is created when the word is not stored yet. The change is
        not committed here; call :meth:`commit` when done.
        """
        self.cursor.execute(
            "UPDATE words SET count = count + ? WHERE word = ?", (count, word)
        )
        if self.cursor.rowcount == 0:
            # Nothing was updated, so this is the first time we see the word.
            self.cursor.execute(
                "INSERT INTO words (word, count) VALUES (?, ?)", (word, count)
            )

    def commit(self):
        """Commit pending changes to the database."""
        self.connection.commit()

    def total_count(self):
        """Return the sum of all stored counts, or 0 when the table is empty."""
        self.cursor.execute("SELECT SUM(count) FROM words")
        row = self.cursor.fetchone()
        # SUM() over zero rows yields a single row holding NULL, not no row.
        total = row[0] if row else None
        return total if total is not None else 0

    def get(self, word):
        """Return the stored count for *word*, or 0 if it is not stored."""
        self.cursor.execute("SELECT count FROM words WHERE word = ?", (word,))
        row = self.cursor.fetchone()
        return row[0] if row else 0

    def most_popular(self, count):
        """Return up to *count* ``(word, count)`` rows, highest count first."""
        self.cursor.execute(
            "SELECT word, count FROM words ORDER BY count DESC LIMIT ?", (count,)
        )
        return self.cursor.fetchall()

    def close(self):
        """Commit pending changes and close the connection.

        Calling this more than once, or on an instance whose connection
        was never opened, does nothing.
        """
        connection = getattr(self, "connection", None)
        if connection is None:
            return
        self.connection = None
        connection.commit()
        connection.close()
        print("Database closed")

    def __del__(self):
        self.close()
# Module-level word-count database, stored in the file "tags.db".
database = WordDatabase("tags.db")
def index():
    """Index every file in ``logs_plain`` into the word database.

    Each regular file directly inside ``logs_plain`` is tokenised with
    ``process_file`` and its counts are added to ``database``. All inserts
    are committed together, or rolled back if an error interrupts the run.
    Afterwards the 100 most popular words are pretty-printed, followed by
    the number of distinct words seen during this run.
    """
    from pprint import pprint

    indexed_words = set()
    # As a context manager the connection commits on success and rolls back
    # on an exception. Unlike an explicit "BEGIN", it does not fail when a
    # transaction is already open.
    with database.connection:
        for file_path in pathlib.Path("logs_plain").iterdir():
            if not file_path.is_file():
                # Sub-directories cannot be opened as text files.
                continue
            for word, count in process_file(file_path).items():
                database.insert(word, count)
                indexed_words.add(word)

    pprint(database.most_popular(100))
    print(len(indexed_words))
# Command-line actions, run in this order when the corresponding flag is set.
if args.find:
    # Print how often the requested word was counted (0 if unknown).
    print(database.get(args.find))

if args.popular:
    for item in database.most_popular(300):
        print(item)
    print(database.total_count())

if args.index:
    # Rebuild from scratch: drop the old table, then index the log files.
    database.reset()
    index()