refactor: improve tokenizer and database classes
This commit is contained in:
parent
74d8372386
commit
aa167aee07
@ -5,6 +5,14 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Version 0.5.0 - 2025-11-05
|
||||||
|
|
||||||
|
Users can now run the application inside a chroot container. Developers can use the new `chroot.py` script to initialize and enter these containers.
|
||||||
|
|
||||||
|
**Changes:** 2 files, 118 lines
|
||||||
|
**Languages:** Markdown (8 lines), Python (110 lines)
|
||||||
|
|
||||||
## Version 0.4.0 - 2025-11-05
|
## Version 0.4.0 - 2025-11-05
|
||||||
|
|
||||||
Testing has been added to verify the HTTP functionality. This improves the reliability and quality of the HTTP features.
|
Testing has been added to verify the HTTP functionality. This improves the reliability and quality of the HTTP features.
|
||||||
|
|||||||
173
tokenizer.py
173
tokenizer.py
@ -36,138 +36,151 @@ import sqlite3
|
|||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
# Command-line interface. Every option is optional and selects one action of
# the script; the actions themselves are dispatched at the bottom of the file.
parser = argparse.ArgumentParser(
    description="Index word frequencies from plain-text logs and query them.",
)
parser.add_argument(
    "--find",
    type=str,
    required=False,
    default="",
    metavar="WORD",
    help="print the stored occurrence count of WORD",
)
parser.add_argument(
    "--index",
    action="store_true",
    help="reset the word table and rebuild it from the files in logs_plain/",
)
parser.add_argument(
    "--popular",
    action="store_true",
    help="print the 300 most frequent words followed by the total word count",
)

# Parsed once at import time; the rest of the module reads the flags from here.
args = parser.parse_args()
|
||||||
|
|
||||||
def is_valid_character(character):
    """Return True when *character* can be part of a word.

    Word characters are those accepted by ``str.isalnum`` plus the
    underscore; every other character acts as a word separator.
    """
    return character.isalnum() or character == "_"


def _record_word(
    word_counts,
    characters,
    start,
    end,
    line_number,
    word_paragraph,
    paragraph,
):
    """Print the trace line for one finished word and bump its count."""
    word_string = "".join(characters)
    # The real number of characters; the former ``end - start`` was one short
    # because both positions are inclusive.
    word_length = len(characters)
    print(
        f"{word_string} {start} {end} {word_length} "
        f"{line_number} {word_paragraph} {paragraph}"
    )
    word_counts[word_string] = word_counts.get(word_string, 0) + 1


def process_file(file_path):
    """Split the text file at *file_path* into words and count them.

    A word is a maximal run of characters accepted by
    ``is_valid_character``. For every word one trace line is printed:
    ``word start end length line word_paragraph paragraph``, where ``start``
    and ``end`` are the 1-based, inclusive character positions of the word
    in the file.

    Args:
        file_path: Path of the file to read (anything ``open`` accepts).

    Returns:
        A dict mapping each word to the number of times it occurs.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    word_counts = {}
    current_word = []
    word_start_position = -1
    word_line_number = 0
    word_paragraph_number = 0
    newlines_since_word = 0
    position = 0
    line_number = 1
    paragraph_number = 1

    # Undecodable bytes become U+FFFD, which is not a word character, so a
    # damaged file degrades to extra separators instead of aborting the run.
    with open(file_path, "r", encoding="utf-8", errors="replace") as file:
        # Line-wise iteration uses the buffered reader instead of one
        # ``read(1)`` call per character; positions are counted the same way.
        for line in file:
            for character in line:
                position += 1

                if character == ".":
                    # NOTE(review): this counter advances on "." and not on
                    # "\n", so it numbers sentences rather than physical
                    # lines - confirm that this is intended.
                    line_number += 1
                elif character == "\n":
                    newlines_since_word += 1

                if not is_valid_character(character):
                    if current_word:
                        _record_word(
                            word_counts,
                            current_word,
                            word_start_position,
                            position - 1,
                            word_line_number,
                            word_paragraph_number,
                            paragraph_number,
                        )
                        current_word = []
                        word_start_position = -1
                    continue

                if not current_word:
                    # Two or more line breaks since the previous word start
                    # a new paragraph.
                    if newlines_since_word >= 2:
                        paragraph_number += 1
                    word_start_position = position
                    word_line_number = line_number
                    word_paragraph_number = paragraph_number

                # Reset on every word character so that single line breaks
                # between words do not add up to a paragraph break.
                newlines_since_word = 0
                current_word.append(character)

    # A word that runs up to end-of-file has no trailing separator to
    # trigger the bookkeeping above; record it here so it is not lost.
    if current_word:
        _record_word(
            word_counts,
            current_word,
            word_start_position,
            position,
            word_line_number,
            word_paragraph_number,
            paragraph_number,
        )

    return word_counts
|
||||||
|
|
||||||
class WordDatabase:
    """SQLite-backed table of words and their occurrence counts.

    All data lives in a single table ``words(word, count)``. The connection
    and its cursor are exposed as ``connection`` and ``cursor``.
    """

    # Shared by ``__init__`` and ``reset`` so the schema is defined once.
    _CREATE_TABLE_SQL = """
        CREATE TABLE IF NOT EXISTS words (
            word TEXT NOT NULL,
            count INTEGER NOT NULL
        )
    """

    def __init__(self, path):
        """Open (or create) the database file at *path*.

        The ``words`` table is created when missing, so the query methods
        work on a fresh database instead of failing with "no such table".
        """
        self.path = path
        # Set first: ``__del__`` also runs when ``sqlite3.connect`` raises.
        self._closed = True
        self.connection = sqlite3.connect(path)
        self._closed = False
        self.cursor = self.connection.cursor()
        self.cursor.execute(self._CREATE_TABLE_SQL)
        self.connection.commit()

    def reset(self):
        """Drop all stored counts and recreate an empty ``words`` table."""
        self.cursor.execute("DROP TABLE IF EXISTS words")
        self.cursor.execute(self._CREATE_TABLE_SQL)
        self.connection.commit()

    def insert(self, word, count):
        """Add *count* to the stored total of *word*, creating the row if needed.

        The change is not committed here; call ``commit`` afterwards.
        """
        self.cursor.execute(
            "UPDATE words SET count = count + ? WHERE word = ?", (count, word)
        )
        # No row was updated, so the word is new.
        if self.cursor.rowcount == 0:
            self.cursor.execute(
                "INSERT INTO words (word, count) VALUES (?, ?)", (word, count)
            )

    def commit(self):
        """Commit the pending changes of the connection."""
        self.connection.commit()

    def total_count(self):
        """Return the sum of all counts, or 0 when the table is empty."""
        # SUM() over zero rows yields one row holding NULL, so the former
        # ``if result`` guard never fired and None was returned; COALESCE
        # turns that NULL into 0.
        self.cursor.execute("SELECT COALESCE(SUM(count), 0) FROM words")
        return self.cursor.fetchone()[0]

    def get(self, word):
        """Return the stored count of *word*, or 0 when it is unknown."""
        self.cursor.execute("SELECT count FROM words WHERE word = ?", (word,))
        result = self.cursor.fetchone()
        return result[0] if result else 0

    def most_popular(self, count):
        """Return up to *count* ``(word, count)`` tuples, highest count first."""
        self.cursor.execute(
            "SELECT word, count FROM words ORDER BY count DESC LIMIT ?", (count,)
        )
        return list(self.cursor.fetchall())

    def close(self):
        """Commit and close the connection; further calls do nothing."""
        # ``getattr`` covers instances whose ``__init__`` never ran.
        if getattr(self, "_closed", True):
            return
        self._closed = True
        try:
            self.connection.commit()
            self.connection.close()
        except sqlite3.ProgrammingError:
            # The connection was already closed through ``self.connection``.
            return
        print("Database closed")

    def __del__(self):
        # Best-effort cleanup; ``close`` copes with half-built instances.
        self.close()
|
||||||
|
|
||||||
# Module-wide database handle used by ``index`` and the command-line actions.
database = WordDatabase("tags.db")


def index():
    """Count the words of every file in ``logs_plain`` and store the totals.

    The per-file counts from ``process_file`` are added to ``database``.
    Afterwards the 100 most frequent words are pretty-printed, followed by
    the number of distinct words seen during this run.

    Raises:
        OSError: If ``logs_plain`` or one of its files cannot be read.
    """
    from pprint import pprint as pp

    indexed_words = set()

    # The connection context manager commits when the block succeeds and
    # rolls back when it raises, so a failed run leaves no partial index.
    with database.connection:
        for file_path in pathlib.Path("logs_plain").iterdir():
            # Sub-directories and other non-files cannot be tokenized.
            if not file_path.is_file():
                continue
            for word, count in process_file(file_path).items():
                database.insert(word, count)
                indexed_words.add(word)

    pp(database.most_popular(100))
    # Used to print a constant 0 (the length of a dict that was never
    # filled); report the number of distinct words instead.
    print(len(indexed_words))
|
||||||
|
|
||||||
|
|
||||||
# Run the actions selected on the command line. The order is fixed: lookup,
# popularity report, then re-indexing.

# --find WORD: print the stored count of a single word.
lookup_word = args.find
if lookup_word:
    print(database.get(lookup_word))

# --popular: print the 300 most frequent (word, count) pairs, one per line,
# followed by the sum of all counts.
if args.popular:
    top_entries = database.most_popular(300)
    for entry in top_entries:
        print(entry)
    print(database.total_count())

# --index: start from an empty table and rebuild it.
if args.index:
    database.reset()
    index()
|
||||||
Loading…
Reference in New Issue
Block a user