|
# I saved this script as a gist because I have rewritten it many times.
|
|
# It supports remembering line numbers and similar positional data, which is not used here.
|
|
# It was originally written in C by me and ported to Python.
|
|
# The original application did use these features.
|
|
|
|
|
|
# Written by retoor@molodetz.nl
|
|
|
|
# This script processes text files to identify words, their positions, and their frequency of occurrence. It uses command-line arguments to query or display popular words and stores results in a SQLite database.
|
|
|
|
# Imports:
|
|
# - argparse: For handling command-line arguments
|
|
# - sqlite3: A library to control and manage SQLite databases
|
|
# - pathlib: To work with filesystem paths in an object-oriented way
|
|
|
|
# MIT License:
|
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
# of this software and associated documentation files (the "Software"), to deal
|
|
# in the Software without restriction, including without limitation the rights
|
|
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
# copies of the Software, and to permit persons to whom the Software is
|
|
# furnished to do so, subject to the following conditions:
|
|
# The above copyright notice and this permission notice shall be included in all
|
|
# copies or substantial portions of the Software.
|
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
# SOFTWARE.
|
|
|
|
|
|
import argparse
|
|
import sqlite3
|
|
import pathlib
|
|
|
|
# Command-line interface. The arguments are parsed at import time because the
# dispatch code at the bottom of the file reads the module-level ``args``.
parser = argparse.ArgumentParser(
    description=(
        "Count word frequencies in text files, store them in a SQLite "
        "database and query the stored counts."
    )
)
parser.add_argument(
    "--find",
    type=str,
    required=False,
    default="",
    metavar="WORD",
    help="print the stored occurrence count for WORD",
)
parser.add_argument(
    "--index",
    action="store_true",
    help="rebuild the word database from the files in logs_plain/",
)
parser.add_argument(
    "--popular",
    action="store_true",
    help="print the most frequent words followed by the total word count",
)

args = parser.parse_args()
|
|
|
|
|
|
def is_valid_character(character):
    """Tell whether ``character`` may be part of a word.

    Underscores and anything for which ``str.isalnum`` is true are
    accepted; everything else is treated as a word separator.
    """
    if character == "_":
        return True
    return character.isalnum()
|
|
|
|
|
|
def process_file(file_path):
    """Count how often each word occurs in a text file.

    A word is a maximal run of characters accepted by
    ``is_valid_character``. ``.`` and newline characters always end the
    current word, as does any other character the helper rejects.

    For every completed word one line is printed with these
    space-separated fields: the word, its 1-based start position, its
    1-based end position (inclusive), its length, the "line" number and
    paragraph number recorded when the word started, and the current
    paragraph number. The "line" counter advances on every ``.``; the
    paragraph counter advances when a word follows two or more newlines.

    Args:
        file_path: Path of the text file to read (anything ``open`` accepts).

    Returns:
        A dict mapping each word to its number of occurrences in the file.
    """
    word_counts = {}
    current_word = []
    word_start_position = -1  # -1 means "not currently inside a word"
    word_line_number = 0
    word_paragraph_number = 0
    newlines_since_last_word = 0
    position = 0
    line_number = 1
    paragraph_number = 1

    def finish_word(word_end_position):
        # Report and count the word collected so far, then empty the buffer.
        word_string = "".join(current_word)
        # Start and end are both inclusive, so the length comes from the
        # buffer rather than from ``end - start`` (which is one too small).
        word_length = len(current_word)
        print(
            f"{word_string} {word_start_position} {word_end_position} {word_length} {word_line_number} {word_paragraph_number} {paragraph_number}"
        )
        word_counts[word_string] = word_counts.get(word_string, 0) + 1
        current_word.clear()

    with open(file_path, "r") as file:
        # Iterating by line lets the buffered reader do the work; the
        # characters seen are the same as with one-character reads.
        for line in file:
            for character in line:
                position += 1
                is_delimiter = False

                if character == ".":
                    line_number += 1
                    is_delimiter = True
                elif character == "\n":
                    newlines_since_last_word += 1
                    is_delimiter = True

                if is_delimiter or not is_valid_character(character):
                    if word_start_position > -1:
                        # The word ended on the previous character.
                        finish_word(position - 1)
                        word_start_position = -1
                    continue

                if newlines_since_last_word >= 2:
                    paragraph_number += 1
                # Reset on every word character so isolated single newlines
                # spread through the text cannot add up to a paragraph break.
                newlines_since_last_word = 0

                if word_start_position == -1:
                    word_start_position = position
                    word_line_number = line_number
                    word_paragraph_number = paragraph_number
                current_word.append(character)

    # A file that ends in the middle of a word has no trailing delimiter to
    # trigger the flush above, so the last word is recorded here.
    if word_start_position > -1:
        finish_word(position)
    return word_counts
|
|
|
|
|
|
class WordDatabase:
    """SQLite-backed store of word occurrence counts.

    The data lives in a single ``words`` table with one row per word.
    Changes made through ``insert`` are not committed automatically; call
    ``commit`` or ``close`` to persist them.
    """

    # ``word`` is the primary key so that the lookups done by ``insert`` and
    # ``get`` use an index instead of scanning the whole table.
    _CREATE_TABLE_SQL = """
        CREATE TABLE IF NOT EXISTS words (
            word TEXT NOT NULL PRIMARY KEY,
            count INTEGER NOT NULL
        )
    """

    def __init__(self, path):
        """Open (or create) the database file at ``path``.

        The ``words`` table is created if it does not exist yet, so the
        query methods work on a database that has never been indexed.
        """
        self.path = path
        self.connection = sqlite3.connect(path)
        self.cursor = self.connection.cursor()
        self.cursor.execute(self._CREATE_TABLE_SQL)
        self.connection.commit()

    def reset(self):
        """Drop all stored counts and recreate an empty ``words`` table."""
        self.cursor.execute("DROP TABLE IF EXISTS words")
        self.cursor.execute(self._CREATE_TABLE_SQL)
        self.connection.commit()

    def insert(self, word, count):
        """Add ``count`` occurrences of ``word``, creating its row if needed.

        The change is not committed; call ``commit`` when done.
        """
        self.cursor.execute(
            "UPDATE words SET count = count + ? WHERE word = ?", (count, word)
        )
        # No row was updated, so the word has not been stored before.
        if self.cursor.rowcount == 0:
            self.cursor.execute(
                "INSERT INTO words (word, count) VALUES (?, ?)", (word, count)
            )

    def commit(self):
        """Commit any pending changes to the database."""
        self.connection.commit()

    def total_count(self):
        """Return the sum of all stored counts (0 when the table is empty)."""
        # SUM() yields NULL for an empty table; COALESCE turns that into 0.
        self.cursor.execute("SELECT COALESCE(SUM(count), 0) FROM words")
        result = self.cursor.fetchone()
        return result[0] if result else 0

    def get(self, word):
        """Return the stored count for ``word``, or 0 if it is unknown."""
        self.cursor.execute("SELECT count FROM words WHERE word = ?", (word,))
        result = self.cursor.fetchone()
        return result[0] if result else 0

    def most_popular(self, count):
        """Return up to ``count`` ``(word, count)`` tuples, most frequent first.

        Words with equal counts are ordered alphabetically so the result is
        deterministic.
        """
        self.cursor.execute(
            "SELECT word, count FROM words ORDER BY count DESC, word ASC LIMIT ?",
            (count,),
        )
        return self.cursor.fetchall()

    def close(self):
        """Commit pending changes and close the connection.

        Safe to call more than once; later calls do nothing.
        """
        # ``connection`` is missing when ``sqlite3.connect`` failed in
        # ``__init__`` and is None once the database has been closed.
        connection = getattr(self, "connection", None)
        if connection is None:
            return
        self.connection = None
        try:
            connection.commit()
        finally:
            connection.close()
        print("Database closed")

    def __del__(self):
        """Close the database when the object is garbage collected."""
        self.close()
|
|
|
|
|
|
# Module-level database handle shared by index() and the command-line
# dispatch code; the data is stored in the file "tags.db".
database = WordDatabase(path="tags.db")
|
|
|
|
|
|
def index():
    """Index every file in ``logs_plain`` into the module-level ``database``.

    Each regular file in the directory is run through ``process_file`` and
    its word counts are added to the database in a single transaction.
    Afterwards the 100 most frequent words are pretty-printed, followed by
    a line containing ``0``.

    Raises:
        FileNotFoundError: If the ``logs_plain`` directory does not exist.
    """
    # Imported locally because it is only needed for the summary below.
    from pprint import pprint as pp

    log_directory = pathlib.Path("logs_plain")

    # The connection context manager commits when the block succeeds and
    # rolls back if anything inside it raises.
    with database.connection:
        database.connection.execute("BEGIN")
        # Sorted so files are processed in a reproducible order.
        for file_path in sorted(log_directory.iterdir()):
            if not file_path.is_file():
                # iterdir() also yields sub-directories, which cannot be
                # opened as text files.
                continue
            for word, count in process_file(file_path).items():
                database.insert(word, count)

    pp(database.most_popular(100))
    print(0)
|
|
|
|
|
|
# Dispatch on the parsed command-line flags.
#
# Indexing runs first so that --find / --popular given in the same
# invocation query the freshly built table rather than the previous one.
if args.index:
    database.reset()
    index()

if args.find:
    print(database.get(args.find))

if args.popular:
    for item in database.most_popular(300):
        print(item)
    print(database.total_count())
|