diff --git a/tokenizer.py b/tokenizer.py new file mode 100644 index 0000000..dac1199 --- /dev/null +++ b/tokenizer.py @@ -0,0 +1,177 @@ + +# I saved this script as a gist because I have rewritten it many times. +# It supports remembering line numbers and similar metadata, which this version does not use. +# I originally wrote it in C and later ported it to Python. +# The original application did use these features. + + +# Written by retoor@molodetz.nl + +# This script processes text files to identify words, their positions, and their frequency of occurrence. It uses command-line arguments to query or display popular words and stores results in a SQLite database. + +# Imports: +# - argparse: For handling command-line arguments +# - sqlite3: A library to control and manage SQLite databases +# - pathlib: To work with filesystem paths in an object-oriented way + +# MIT License: +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
import argparse
import pathlib
import sqlite3
from pprint import pprint

# Locations used by the command-line entry point.
DEFAULT_DB_PATH = "tags.db"
DEFAULT_LOG_DIR = "logs_plain"


def is_valid_char(c):
    """Return True when *c* can be part of a word (alphanumeric or ``_``)."""
    return c.isalnum() or c == '_'


def _record_word(words, chars, start, end, line, alinia, current_alinia):
    """Count one finished word in *words* and print its diagnostic line."""
    text = ''.join(chars)
    print(f"{text} {start} {end} {len(chars)} {line} {alinia} {current_alinia}")
    words[text] = words.get(text, 0) + 1


def process_file(file_path):
    """Split a text file into words and count how often each one occurs.

    A word is a maximal run of characters accepted by ``is_valid_char``.
    For every word one diagnostic line is printed::

        <word> <start> <end> <length> <line> <paragraph> <current paragraph>

    ``start``/``end`` are 1-based character offsets into the file.  The
    ``line`` counter advances on every ``.`` (not on newlines), and the
    paragraph ("alinia") counter advances when a word is preceded by two or
    more newlines that have no word characters between them.

    Args:
        file_path: Path of the text file to read (``str`` or ``pathlib.Path``).

    Returns:
        A dict mapping each word to its number of occurrences in the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    words = {}
    word = []            # characters of the word currently being assembled
    word_start = 0       # 1-based offset of the current word's first character
    word_line = 0
    word_alinia = 0
    new_line_count = 0   # newlines seen since the last word character
    pos = 0
    line = 1
    alinia = 1

    # errors="replace" keeps a single undecodable byte from aborting the run;
    # the replacement character is not alphanumeric, so it acts as a delimiter.
    with open(file_path, 'r', errors='replace') as f:
        for chunk in f:
            for c in chunk:
                pos += 1

                if c == '.':
                    line += 1
                elif c == '\n':
                    new_line_count += 1

                if not is_valid_char(c):
                    if word:
                        # The delimiter sits at ``pos``; the word ended one before.
                        _record_word(words, word, word_start, pos - 1,
                                     word_line, word_alinia, alinia)
                        word = []
                    continue

                if new_line_count >= 2:
                    alinia += 1
                # Any word character ends the current run of newlines, so
                # isolated line breaks never add up to a paragraph break.
                new_line_count = 0

                if not word:
                    word_start = pos
                    word_line = line
                    word_alinia = alinia
                word.append(c)

    # A file that does not end with a delimiter still has a pending word.
    if word:
        _record_word(words, word, word_start, pos, word_line, word_alinia, alinia)

    return words


class WordDb:
    """Word-frequency store backed by a SQLite ``words(word, count)`` table.

    ``self.words`` caches the counts inserted through this instance.  The
    cache starts empty, so call ``reset()`` before bulk-inserting into a
    database that may already contain rows; otherwise words already stored
    get a second row.

    Instances can be used as context managers; leaving the ``with`` block
    commits and closes the connection.
    """

    _CREATE_TABLE = """
        CREATE TABLE IF NOT EXISTS words (
            word TEXT NOT NULL,
            count INTEGER NOT NULL
        )
    """

    def __init__(self, path):
        """Open (or create) the database at *path* and ensure the table exists."""
        self.path = path
        self.words = {}
        # Set before connecting so close()/__del__ stay safe if connect() raises.
        self.conn = None
        self.cursor = None
        self.conn = sqlite3.connect(path)
        self.cursor = self.conn.cursor()
        # Queries on a brand-new database must not fail with "no such table".
        self.cursor.execute(self._CREATE_TABLE)
        self.conn.commit()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def reset(self):
        """Drop all stored words and recreate an empty table."""
        self.words = {}
        self.cursor.execute("DROP TABLE IF EXISTS words")
        self.cursor.execute(self._CREATE_TABLE)
        self.conn.commit()

    def insert(self, word, count):
        """Add *count* occurrences of *word*; call ``commit()`` to persist."""
        if word not in self.words:
            self.words[word] = count
            self.cursor.execute("INSERT INTO words (word, count) VALUES (?, ?)", (word, count))
        else:
            self.words[word] += count
            self.cursor.execute("UPDATE words SET count = ? WHERE word = ?", (self.words[word], word))

    def commit(self):
        """Commit pending inserts and updates."""
        self.conn.commit()

    def total_count(self):
        """Return the sum of all word counts (0 when the table is empty)."""
        self.cursor.execute("SELECT SUM(count) FROM words")
        row = self.cursor.fetchone()
        # SUM() over zero rows yields NULL, which arrives here as None.
        return row[0] if row and row[0] is not None else 0

    def get(self, word):
        """Return the stored count for *word*, or 0 when it is unknown."""
        self.cursor.execute("SELECT count FROM words WHERE word = ?", (word,))
        row = self.cursor.fetchone()
        return row[0] if row is not None else 0

    def most_popular(self, count):
        """Return up to *count* ``(word, count)`` tuples, most frequent first."""
        self.cursor.execute("SELECT word, count FROM words ORDER BY count DESC LIMIT ?", (count,))
        return self.cursor.fetchall()

    def close(self):
        """Commit and close the connection; safe to call more than once."""
        conn = getattr(self, "conn", None)
        if conn is None:
            return
        self.conn = None
        self.cursor = None
        conn.commit()
        conn.close()
        print("Database closed")

    def __del__(self):
        self.close()


def index(db=None, directory=DEFAULT_LOG_DIR):
    """Index every regular file in *directory* into *db*.

    Prints the 100 most popular words, commits, then prints and returns the
    number of distinct words seen during this run.  The database is not
    reset here; call ``db.reset()`` first to rebuild it from scratch.

    Args:
        db: An open ``WordDb``.  When omitted, ``DEFAULT_DB_PATH`` is opened
            for the duration of the call.
        directory: Directory whose files are tokenized.

    Raises:
        OSError: If *directory* or one of its files cannot be read.
    """
    if db is None:
        with WordDb(DEFAULT_DB_PATH) as default_db:
            return index(default_db, directory)

    indexed = set()
    # sorted() makes the processing (and therefore output) order deterministic.
    for path in sorted(pathlib.Path(directory).iterdir()):
        if not path.is_file():
            continue  # sub-directories cannot be opened as text files
        for key, value in process_file(path).items():
            db.insert(key, value)
            indexed.add(key)

    pprint(db.most_popular(100))
    db.commit()
    print(len(indexed))
    return len(indexed)


def build_parser():
    """Return the command-line parser (``--find``, ``--index``, ``--popular``)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--find', type=str, required=False, default="")
    parser.add_argument('--index', action='store_true')
    parser.add_argument('--popular', action='store_true')
    return parser


def main(argv=None):
    """Run the command-line interface; *argv* defaults to ``sys.argv[1:]``."""
    args = build_parser().parse_args(argv)
    with WordDb(DEFAULT_DB_PATH) as db:
        # Index first so --find/--popular in the same run see fresh data.
        if args.index:
            db.reset()
            index(db)

        if args.find:
            print(db.get(args.find))

        if args.popular:
            for item in db.most_popular(300):
                print(item)
            print(db.total_count())


if __name__ == "__main__":
    main()