# I saved this script as a gist because I have rewritten it many times.
# It supports remembering line numbers and similar details, which are not used here.
# It was originally written in C by me and ported to Python.
# The original application did use these features.
# Written by retoor@molodetz.nl
# This script processes text files to identify words, their positions, and their frequency of occurrence. It uses command-line arguments to query or display popular words and stores results in a SQLite database.
# Imports:
# - argparse: For handling command-line arguments
# - sqlite3: A library to control and manage SQLite databases
# - pathlib: To work with filesystem paths in an object-oriented way
# MIT License:
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import argparse
import sqlite3
2025-11-05 14:12:23 +01:00
import pathlib
2025-01-05 15:57:33 +01:00
# Command-line interface. The flags are independent and may be combined:
#   --index    rebuild the word database from the input files
#   --find W   print the stored occurrence count of word W
#   --popular  print the most frequent words followed by the total count
parser = argparse.ArgumentParser(
    description="Index word frequencies from text files into a SQLite database."
)
parser.add_argument(
    "--find",
    type=str,
    required=False,
    default="",  # empty string is falsy, i.e. "no lookup requested"
    help="print the stored occurrence count of this word",
)
parser.add_argument(
    "--index",
    action="store_true",
    help="reset the database and index the input files again",
)
parser.add_argument(
    "--popular",
    action="store_true",
    help="print the most frequent words and the total word count",
)
args = parser.parse_args()
2025-11-05 14:12:23 +01:00
def is_valid_character(character):
    """Return True when *character* may be part of a word.

    Word characters are everything ``str.isalnum`` accepts, plus the
    underscore. Any other character terminates the current word.
    """
    return character.isalnum() or character == "_"
2025-01-05 15:57:33 +01:00
def _record_word(
    word_counts, characters, start, end, line, paragraph, current_paragraph
):
    """Print the trace line for one finished word and add it to *word_counts*."""
    word = "".join(characters)
    print(
        f"{word} {start} {end} {len(characters)} {line} {paragraph} {current_paragraph}"
    )
    word_counts[word] = word_counts.get(word, 0) + 1


def process_file(file_path):
    """Count how often each word occurs in the text file at *file_path*.

    A word is a maximal run of characters accepted by
    ``is_valid_character``. For every word found, one trace line is
    printed containing: the word, its start and end position (1-based,
    inclusive character offsets), its length, the line and paragraph
    number at which it started, and the current paragraph number.

    ``line_number`` is advanced by each ``"."`` character (not by
    newlines); the paragraph number is advanced when two or more
    newlines separate a word from the previous one.

    Returns:
        dict mapping each word to its number of occurrences in the file.
    """
    word_counts = {}
    current_word = []
    word_start_position = -1  # -1 while not inside a word
    word_line_number = 0
    word_paragraph_number = 0
    consecutive_newlines = 0
    position = 0
    line_number = 1
    paragraph_number = 1

    with open(file_path, "r") as file:
        # Iterating lines lets the buffered reader do its work instead of
        # issuing one read(1) call per character.
        for text_line in file:
            for character in text_line:
                position += 1
                if character == ".":
                    line_number += 1
                elif character == "\n":
                    consecutive_newlines += 1

                if not is_valid_character(character):
                    # A delimiter ends the word in progress, if there is one.
                    if current_word:
                        _record_word(
                            word_counts,
                            current_word,
                            word_start_position,
                            position - 1,
                            word_line_number,
                            word_paragraph_number,
                            paragraph_number,
                        )
                        current_word = []
                        word_start_position = -1
                    continue

                if consecutive_newlines >= 2:
                    paragraph_number += 1
                # Reset on every word character so only newlines that are
                # not separated by a word count towards a paragraph break.
                consecutive_newlines = 0

                if not current_word:
                    word_start_position = position
                    word_line_number = line_number
                    word_paragraph_number = paragraph_number
                current_word.append(character)

    # A file that ends without a trailing delimiter still has a pending word.
    if current_word:
        _record_word(
            word_counts,
            current_word,
            word_start_position,
            position,
            word_line_number,
            word_paragraph_number,
            paragraph_number,
        )
    return word_counts
2025-01-05 15:57:33 +01:00
2025-11-05 14:12:23 +01:00
class WordDatabase:
    """Thin wrapper around a SQLite database holding one ``words`` table.

    The table stores one row per word together with its accumulated
    occurrence count.
    """

    # ``word`` is the primary key so the per-word UPDATE/SELECT statements
    # use an index instead of scanning the whole table.
    _SCHEMA = """
        CREATE TABLE IF NOT EXISTS words (
            word TEXT NOT NULL PRIMARY KEY,
            count INTEGER NOT NULL
        )
    """

    def __init__(self, path):
        """Open (or create) the database at *path* and ensure the table exists."""
        self.path = path
        self.connection = sqlite3.connect(path)
        self.cursor = self.connection.cursor()
        # Make queries work on a database that has never been indexed.
        self.cursor.execute(self._SCHEMA)
        self.connection.commit()

    def reset(self):
        """Drop all stored words and recreate an empty ``words`` table."""
        self.cursor.execute("DROP TABLE IF EXISTS words")
        self.cursor.execute(self._SCHEMA)
        self.connection.commit()

    def insert(self, word, count):
        """Add *count* to the stored total of *word*, creating the row if needed.

        The change is not committed; call ``commit()`` when done.
        """
        self.cursor.execute(
            "UPDATE words SET count = count + ? WHERE word = ?", (count, word)
        )
        if self.cursor.rowcount == 0:
            self.cursor.execute(
                "INSERT INTO words (word, count) VALUES (?, ?)", (word, count)
            )

    def commit(self):
        """Commit the current transaction."""
        self.connection.commit()

    def total_count(self):
        """Return the sum of all word counts, or 0 when the table is empty."""
        # SUM() over zero rows yields NULL, so fall back to 0 explicitly.
        self.cursor.execute("SELECT COALESCE(SUM(count), 0) FROM words")
        return self.cursor.fetchone()[0]

    def get(self, word):
        """Return the stored count of *word*, or 0 when it is unknown."""
        self.cursor.execute("SELECT count FROM words WHERE word = ?", (word,))
        result = self.cursor.fetchone()
        return result[0] if result else 0

    def most_popular(self, count):
        """Return up to *count* ``(word, count)`` tuples, most frequent first.

        Words with equal counts are ordered alphabetically so the result
        is deterministic.
        """
        self.cursor.execute(
            "SELECT word, count FROM words ORDER BY count DESC, word ASC LIMIT ?",
            (count,),
        )
        return list(self.cursor.fetchall())

    def close(self):
        """Commit pending changes and close the connection; safe to call twice."""
        if getattr(self, "connection", None) is None:
            return
        self.connection.commit()
        self.connection.close()
        self.connection = None

    def __del__(self):
        # The connection attribute is missing when __init__ failed and is
        # None after an explicit close(); there is nothing to do then.
        if getattr(self, "connection", None) is None:
            return
        self.close()
        print("Database closed")
2025-11-05 14:12:23 +01:00
# Shared database handle used by index() and by the command-line dispatch.
database = WordDatabase("tags.db")
2025-01-05 15:57:33 +01:00
def index(directory="logs_plain"):
    """Index every regular file in *directory* into the shared database.

    All inserts run inside one transaction. When anything fails, the
    transaction is rolled back and the exception is re-raised, so a
    partial index is never committed. After a successful run the 100
    most frequent words are pretty-printed, followed by a line
    containing ``0``.

    Args:
        directory: Folder whose files are indexed; sub-directories and
            other non-file entries are skipped.
    """
    from pprint import pprint as pp

    database.connection.execute("BEGIN")
    try:
        for file_path in pathlib.Path(directory).iterdir():
            # open() cannot read directories; only regular files are indexed.
            if not file_path.is_file():
                continue
            for word, count in process_file(file_path).items():
                database.insert(word, count)
    except BaseException:
        # Also covers KeyboardInterrupt: without the rollback, the commit
        # done when the database object is destroyed would persist a
        # half-built index.
        database.connection.rollback()
        raise
    database.connection.commit()

    pp(database.most_popular(100))
    print(0)
2025-01-05 15:57:33 +01:00
# Rebuild the index first so that --find / --popular given in the same
# invocation report the freshly indexed data instead of the previous run.
if args.index:
    database.reset()
    index()

if args.find:
    print(database.get(args.find))

if args.popular:
    for item in database.most_popular(300):
        print(item)
    print(database.total_count())