Compare commits

...

27 Commits

Author SHA1 Message Date
JestDotty
5fc6c839a1 notes
Some checks failed
isspam build / build (push) Failing after 4m49s
2025-10-04 10:03:15 -04:00
JestDotty
c459fe6d79 more tests 2025-10-04 09:58:40 -04:00
JestDotty
d73d4ff7c1 reorg 2025-10-04 09:18:03 -04:00
JestDotty
94b786f83a sequential file read slightly faster 2025-03-24 16:03:36 -04:00
JestDotty
422e8ace29 clean up README 2025-03-24 00:48:04 -04:00
JestDotty
c581ca6817 fxhash and I remembered how to static lazy right 2025-03-24 00:20:15 -04:00
JestDotty
41f5398f20 do or do not, there is always a trie organization 2025-03-23 23:58:34 -04:00
JestDotty
55a4901a30 if a word has forbidden only count it once. uses for loops again whoops 2025-03-23 23:23:40 -04:00
JestDotty
320f6bf4af muncher benchmark and data 2025-03-23 23:14:30 -04:00
JestDotty
a699aba7c2 don't track 2025-03-23 22:31:14 -04:00
JestDotty
90cd44f302 optimization benchmarks. switched to tokio 2025-03-23 22:29:54 -04:00
JestDotty
e3c71f8fc9 sync. rayon and build options, README 2025-03-23 21:39:12 -04:00
6ddef94103 updates. 2025-03-24 02:31:31 +01:00
Jest Dotty
0eaf353463 flatten jest_rust 2025-03-23 20:23:38 -04:00
Jest Dotty
1c0fc334e8 jest_rust, build, bench 2025-03-23 20:03:09 -04:00
BordedDev
f1b9005c9c Removed TBB from default makefile 2025-03-23 23:37:31 +01:00
BordedDev
50e01a2fe6 Fixed horrible mistake ;P 2025-03-23 22:40:43 +01:00
BordedDev
f2c0942cc2 Updated file io perf 2025-03-23 22:06:48 +01:00
BordedDev
177a512c38 Tweaked some variables, add main2.cpp which removes the struct 2025-03-23 03:36:29 +01:00
42938575d3 Added py 2025-03-23 03:25:51 +01:00
BordedDev
b14337dc60 Fixed issues with utf8 2025-03-23 02:13:38 +01:00
7dc6143a7f Added python version. 2025-03-22 23:55:48 +01:00
BordedDev
6b148b3235 Updated Dockerfile to use gcc
Added borded cpp to bench
Made borded part of make all
2025-03-20 23:32:24 +01:00
BordedDev
c78d43b593 Added borded cpp to make file 2025-03-20 23:21:33 +01:00
9b1ab5ed0c Docker setup. 2025-03-20 22:52:53 +01:00
748c404404 Retoorii fixes. 2025-03-20 22:23:41 +01:00
BordedDev
8528fe8f0f Setup v1 of borded spam parser 2025-03-20 21:44:22 +01:00
22 changed files with 1675 additions and 1 deletions

3
.gitignore vendored
View File

@ -5,7 +5,10 @@ publish
books books
__pycache__ __pycache__
target target
./isspam.py
isspam isspam
risspam risspam
/jisspam
isspam_cpp isspam_cpp
.build-trigger-2014-12-02 15:26 .build-trigger-2014-12-02 15:26
borded_cpp_exec

View File

@ -1,20 +1,32 @@
CC = gcc CC = gcc
CFLAGS = -Ofast CFLAGS = -Ofast
all: build run valgrind build_risspam run_risspam build_cpp all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest
build: build:
@echo "Compiling retoor_c project.". @echo "Compiling retoor_c project.".
@$(CC) $(CFLAGS) retoor_c/isspam.c -o isspam @$(CC) $(CFLAGS) retoor_c/isspam.c -o isspam
build_py:
@echo "Copying py file"
@cp retoor_c/isspam.py isspam.py
build_cpp: build_cpp:
@echo "Compiling C++ version of isspam." @echo "Compiling C++ version of isspam."
@g++ -Ofast retoor_c/isspam.cpp -o isspam_cpp @g++ -Ofast retoor_c/isspam.cpp -o isspam_cpp
build_borded_cpp:
@echo "Compiling Borded C++ version of isspam."
@g++ -std=c++23 -Ofast borded_cpp/src/main3.cpp -o borded_cpp_exec
build_risspam: build_risspam:
@echo "Compiling 12bitfloat_risspam project." @echo "Compiling 12bitfloat_risspam project."
cd 12bitfloat_rust/risspam && cargo run --release && cp target/release/risspam ../../ cd 12bitfloat_rust/risspam && cargo run --release && cp target/release/risspam ../../
build_jest:
@echo "compiling jest_rust project"
cd jest_rust && cargo build --release && cp target/release/jisspam ..
run: run_spam wl run_not_spam run: run_spam wl run_not_spam
run_risspam: run_spam_risspam run_not_spam_risspam run_risspam: run_spam_risspam run_not_spam_risspam

View File

@ -11,4 +11,13 @@ print("Time Rust:",time.time() - time_start)
time_start = time.time() time_start = time.time()
subprocess.check_output('./isspam_cpp books/*.txt', shell=True) subprocess.check_output('./isspam_cpp books/*.txt', shell=True)
print("Time CPP:",time.time() - time_start) print("Time CPP:",time.time() - time_start)
time_start = time.time()
subprocess.check_output('./borded_cpp_exec books/*.txt', shell=True)
print("Time Borded CPP:",time.time() - time_start)
time_start = time.time()
subprocess.check_output('./jisspam books/*.txt', shell=True)
print("Time Jest Rust:", time.time() - time_start)
time_start = time.time()
subprocess.check_output('python3 isspam.py books/*.txt', shell=True)
print("Time Retoor Python:",time.time() - time_start)
print("***end benchmark***") print("***end benchmark***")

97
borded_cpp/.gitignore vendored Normal file
View File

@ -0,0 +1,97 @@
*.d
*.slo
*.lo
*.o
*.obj
*.gch
*.pch
*.so
*.dylib
*.dll
*.mod
*.smod
*.lai
*.la
*.a
*.lib
*.exe
*.out
*.app
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
.idea/**/aws.xml
.idea/**/contentModel.xml
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
.idea/**/gradle.xml
.idea/**/libraries
.idea
cmake-build-*/
.idea/**/mongoSettings.xml
*.iws
out/
.idea_modules/
atlassian-ide-plugin.xml
.idea/replstate.xml
.idea/sonarlint/
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
.idea/httpRequests
.idea/caches/build_file_checksums.ser
*~
.fuse_hidden*
.directory
.Trash-*
.nfs*
CMakeLists.txt.user
CMakeCache.txt
CMakeFiles
CMakeScripts
Testing
Makefile
cmake_install.cmake
install_manifest.txt
compile_commands.json
CTestTestfile.cmake
_deps
CMakeUserPresets.json
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
*.stackdump
[Dd]esktop.ini
$RECYCLE.BIN/
*.cab
*.msi
*.msix
*.msm
*.msp
*.lnk
.DS_Store
.AppleDouble
.LSOverride
Icon
._*
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

27
borded_cpp/CMakeLists.txt Normal file
View File

@ -0,0 +1,27 @@
# Build script for the Borded C++ isspam implementations.
cmake_minimum_required(VERSION 3.25)
project(isspam)

set(CMAKE_CXX_STANDARD 26)

if (MSVC)
    # Strict warnings as errors for our own code; headers included with
    # angle brackets are treated as external and kept quiet.
    add_compile_options(/W4 /WX /external:anglebrackets /external:W0 /wd4100 /wd5050)
    # Trim <windows.h> and keep its min/max macros out of the way.
    add_compile_definitions(WIN32_LEAN_AND_MEAN VC_EXTRALEAN NOMINMAX)
else ()
    add_compile_options(-Wall -Wextra -Wpedantic)
    # add_compile_options(-Werror)
endif ()

add_executable(${PROJECT_NAME} src/main.cpp)
add_executable(${PROJECT_NAME}3 src/main3.cpp)

if (LINUX)
    # Both executables are linked against TBB on Linux.
    foreach (isspam_target IN ITEMS ${PROJECT_NAME} ${PROJECT_NAME}3)
        target_link_libraries(${isspam_target} tbb)
    endforeach ()
endif ()

3
borded_cpp/Dockerfile Normal file
View File

@ -0,0 +1,3 @@
# Build environment for the C++ implementation: GCC image plus CMake and gdb.
FROM gcc:latest
# apt-get is the interface intended for scripts (apt warns that its CLI is
# unstable); the package lists are removed so they do not bloat the layer.
RUN apt-get update \
    && apt-get install -y cmake gdb \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /home

9
borded_cpp/compose.yml Normal file
View File

@ -0,0 +1,9 @@
# Builds the image from this directory and runs doit.sh inside it.
services:
  cpp:
    build: .
    command: ["sh","doit.sh"]
    tty: true
    stdin_open: true
    volumes:
      # project sources are mounted at the container's working directory
      - ./:/home
      # sample input texts
      - ../books:/books

2
borded_cpp/doit.sh Executable file
View File

@ -0,0 +1,2 @@
#!/bin/sh
# Rebuilds the project from scratch in ./build.
# Run from the directory that contains CMakeLists.txt.
set -e

# `rm -rf` already succeeds when the directory is absent, so no error
# suppression is needed (the previous `| true` piped rm's output into `true`
# instead of guarding its exit status).
rm -rf build
mkdir build && cd build && cmake .. && make

221
borded_cpp/src/main.cpp Normal file
View File

@ -0,0 +1,221 @@
#include <string>
#include <string_view>
#include <fstream>
#include <algorithm>
#include <iostream>
#include <numeric>
#include <execution>
#include <format>
#include <codecvt>
#include <ranges>
// Use <print> when the library advertises it; otherwise provide minimal
// std::print / std::println replacements built on std::vformat and std::cout.
// NOTE(review): adding function templates to namespace std is not permitted by
// the C++ standard; consider moving these into a project namespace.
// NOTE(review): the feature-test macro is checked before <print> or <version>
// has been included here — confirm it is actually defined at this point.
#ifdef __cpp_lib_print
#include <print>
#else
namespace std {
// Formats `args` according to `format` and writes the text to std::cout.
template <typename T, typename... Args>
inline void print(T format, Args &&... args) {
auto f = std::vformat(format, std::make_format_args(args...));
std::cout << f;
}
// Same as print, followed by std::endl (newline plus flush).
template <typename T, typename... Args>
inline void println(T format, Args &&... args) {
auto f = std::vformat(format, std::make_format_args(args...));
std::cout << f << std::endl;
}
}
#endif
constexpr std::array<std::wstring_view, 35> BAD_WORDS = {
L"recovery",
L"techie",
L"http",
L"https",
L"digital",
L"hack",
L"::",
L"//",
L"com",
L"@",
L"crypto",
L"bitcoin",
L"wallet",
L"hacker",
L"welcome",
L"whatsapp",
L"email",
L"cryptocurrency",
L"stolen",
L"freeze",
L"quick",
L"crucial",
L"tracing",
L"scammers",
L"expers",
L"hire",
L"century",
L"transaction",
L"essential",
L"managing",
L"contact",
L"contacting",
L"understanding",
L"assets",
L"funds",
};
constexpr auto SHORTEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::max(),
[](std::size_t current, const std::wstring_view &word) {
return std::min(current, word.size());
}
);
constexpr auto LONGEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::min(),
[](std::size_t current, const std::wstring_view &word) {
return std::max(current, word.size());
}
);
// Statistics gathered for one file, or the sum over several files.
// A default-constructed value stands for exactly one file (fileCount == 1).
struct AnalysisResult {
    std::size_t totalWordCount = 0;
    std::size_t totalCapitalizedCount = 0;
    std::size_t totalSentenceCount = 0;
    std::size_t totalNumberCount = 0;
    std::size_t totalForbiddenCount = 0;
    std::size_t fileCount = 1;
    std::size_t failCount = 0;

    // Renders the seven counters as a multi-line report (no trailing newline).
    operator std::string() const {
        return std::format(
            "Word Count: {}\n"
            "Capitalized Count: {}\n"
            "Sentence Count: {}\n"
            "Number Count: {}\n"
            "Forbidden Count: {}\n"
            "File Count: {}\n"
            "Fail Count: {}",
            totalWordCount, totalCapitalizedCount, totalSentenceCount, totalNumberCount,
            totalForbiddenCount, fileCount, failCount
        );
    }

    // Field-wise sum of two results.
    friend AnalysisResult operator+(const AnalysisResult &lhs, const AnalysisResult &rhs) {
        AnalysisResult sum = lhs;
        sum.totalWordCount += rhs.totalWordCount;
        sum.totalCapitalizedCount += rhs.totalCapitalizedCount;
        sum.totalSentenceCount += rhs.totalSentenceCount;
        sum.totalNumberCount += rhs.totalNumberCount;
        sum.totalForbiddenCount += rhs.totalForbiddenCount;
        sum.fileCount += rhs.fileCount;
        sum.failCount += rhs.failCount;
        return sum;
    }
};
// Lower-cases `word` in place and increments `forbiddenCount` when the result
// is exactly equal to an entry of BAD_WORDS. Words whose length cannot match
// any entry are returned untouched (not lower-cased).
void check_word(std::wstring &word, std::size_t &forbiddenCount) {
    const std::size_t length = word.size();
    if (length < SHORTEST_BAD_WORD || length > LONGEST_BAD_WORD) {
        return;
    }
    for (wchar_t &ch : word) {
        ch = static_cast<wchar_t>(::towlower(ch));
    }
    // Exact match only; substring matching is intentionally not used.
    const bool forbidden = std::ranges::any_of(BAD_WORDS, [&word](const std::wstring_view &badWord) {
        return badWord == word;
    });
    if (forbidden) {
        forbiddenCount++;
    }
}
// Reads `filename` as UTF-8 text and returns its statistics.
//
// Counting rules:
//   - every '.' is one sentence;
//   - a word is a maximal run of non-whitespace characters;
//   - a word is capitalized when its first character is upper case;
//   - a word is counted as a number once, on the first digit it contains;
//   - each finished word is passed to check_word for the forbidden-word test.
//
// When the file cannot be opened a message is printed and a default
// AnalysisResult is returned (fileCount 1, failCount 0). A stream failure
// other than end-of-file increments failCount.
AnalysisResult parseFile(const std::string_view &filename) {
    std::wifstream file;
    // suppress warning of deprecation
#pragma warning(push)
#pragma warning(suppress : 4996)
    file.imbue(std::locale(std::locale(), new std::codecvt_utf8<wchar_t>));
#pragma warning(pop)
    file.open(std::string(filename));
    if (!file.is_open()) {
        std::println("File doesn't exist: {}", filename);
        return { };
    }

    AnalysisResult result{ };
    bool inWord = false;
    bool isDigit = false;
    wchar_t c;
    std::wstring word;
    while (file.get(c)) {
        if (c == L'.') {
            result.totalSentenceCount++;
        }
        // The wide classifiers are required here: the narrow std::isspace /
        // isupper / isdigit have undefined behavior for values that do not
        // fit in unsigned char, which decoded UTF-8 text routinely produces.
        if (::iswspace(c)) {
            inWord = false;
            isDigit = false;
            if (!word.empty()) {
                check_word(word, result.totalForbiddenCount);
                word.clear();
            }
            continue;
        }
        if (!inWord) {
            result.totalWordCount++;
            if (::iswupper(c)) {
                result.totalCapitalizedCount++;
            }
        }
        inWord = true;
        if (::iswdigit(c) && !isDigit) {
            result.totalNumberCount++;
            isDigit = true;
        }
        word.push_back(c);
    }
    // The last word has no trailing whitespace to trigger the check above.
    if (!word.empty()) {
        check_word(word, result.totalForbiddenCount);
    }
    file.close();
    // Reaching end-of-file also sets failbit, so only a failure without eof
    // (e.g. a decoding error) is reported.
    if (file.fail() && !file.eof()) {
        result.failCount++;
    }
    return result;
}
// Analyses every file named on the command line in parallel, sums the
// per-file results and prints the combined report.
// Returns 1 when no file is given, otherwise 0.
int main(const int argc, char *argv[]) {
    if (argc < 2) {
        std::println("Usage: {} <file1> <file2> ... <fileN>", argv[0]);
        return 1;
    }
    // `par`, not `par_unseq`: parseFile performs file I/O and allocates memory,
    // which is not permitted inside an unsequenced (vectorizable) policy.
    // The initial value has fileCount 0 because every parseFile result already
    // accounts for one file.
    const AnalysisResult result = std::transform_reduce(std::execution::par, std::next(argv), argv + argc,
                                                        AnalysisResult{.fileCount = 0},
                                                        std::plus{ },
                                                        parseFile
    );
    // Guard each ratio against a zero denominator.
    double capitalizedPercentage = (result.totalWordCount > 0)
                                       ? static_cast<double>(result.totalCapitalizedCount) / result.totalWordCount * 100.0
                                       : 0;
    double forbiddenPercentage = (result.totalWordCount > 0)
                                     ? static_cast<double>(result.totalForbiddenCount) / result.totalWordCount * 100.0
                                     : 0;
    double wordCountPerSentence = (result.totalSentenceCount > 0)
                                      ? static_cast<double>(result.totalWordCount) / result.totalSentenceCount
                                      : 0;
    std::println("{}\nCapitalized Percentage: {}%\nForbidden Percentage: {}%\nWord Count Per Sentence: {}", std::string(result),
                 capitalizedPercentage, forbiddenPercentage, wordCountPerSentence
    );
    return 0;
}

195
borded_cpp/src/main2.cpp Normal file
View File

@ -0,0 +1,195 @@
#include <string>
#include <string_view>
#include <fstream>
#include <algorithm>
#include <iostream>
#include <numeric>
#include <execution>
#include <format>
#include <codecvt>
#include <ranges>
// Use <print> when the library advertises it; otherwise provide minimal
// std::print / std::println replacements built on std::vformat and std::cout.
// NOTE(review): adding function templates to namespace std is not permitted by
// the C++ standard; consider moving these into a project namespace.
// NOTE(review): the feature-test macro is checked before <print> or <version>
// has been included here — confirm it is actually defined at this point.
#ifdef __cpp_lib_print
#include <print>
#else
namespace std {
// Formats `args` according to `format` and writes the text to std::cout.
template <typename T, typename... Args>
inline void print(T format, Args &&... args) {
auto f = std::vformat(format, std::make_format_args(args...));
std::cout << f;
}
// Same as print, followed by std::endl (newline plus flush).
template <typename T, typename... Args>
inline void println(T format, Args &&... args) {
auto f = std::vformat(format, std::make_format_args(args...));
std::cout << f << std::endl;
}
}
#endif
constexpr std::array<std::wstring_view, 35> BAD_WORDS = {
L"recovery",
L"techie",
L"http",
L"https",
L"digital",
L"hack",
L"::",
L"//",
L"com",
L"@",
L"crypto",
L"bitcoin",
L"wallet",
L"hacker",
L"welcome",
L"whatsapp",
L"email",
L"cryptocurrency",
L"stolen",
L"freeze",
L"quick",
L"crucial",
L"tracing",
L"scammers",
L"expers",
L"hire",
L"century",
L"transaction",
L"essential",
L"managing",
L"contact",
L"contacting",
L"understanding",
L"assets",
L"funds",
};
constexpr auto SHORTEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::max(),
[](std::size_t current, const std::wstring_view &word) {
return std::min(current, word.size());
}
);
constexpr auto LONGEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::min(),
[](std::size_t current, const std::wstring_view &word) {
return std::max(current, word.size());
}
);
// Process-wide counters filled in by parseFile (forbidden words via
// check_word) and printed by main.
// NOTE(review): these are plain variables with no atomic type or lock —
// confirm parseFile is never executed concurrently.
std::size_t totalWordCount = 0;
std::size_t totalCapitalizedCount = 0;
std::size_t totalSentenceCount = 0;
std::size_t totalNumberCount = 0;
std::size_t totalForbiddenCount = 0;
// NOTE(review): starts at 1 rather than 0 — verify that whoever reports this
// value accounts for that.
std::size_t fileCount = 1;
std::size_t failCount = 0;
// Lower-cases `word` in place and increments `forbiddenCount` when the result
// is exactly equal to an entry of BAD_WORDS. Words whose length cannot match
// any entry are returned untouched (not lower-cased).
void check_word(std::wstring &word, std::size_t &forbiddenCount) {
    const std::size_t length = word.size();
    if (length < SHORTEST_BAD_WORD || length > LONGEST_BAD_WORD) {
        return;
    }
    for (wchar_t &ch : word) {
        ch = static_cast<wchar_t>(::towlower(ch));
    }
    // Exact match only; substring matching is intentionally not used.
    const bool forbidden = std::ranges::any_of(BAD_WORDS, [&word](const std::wstring_view &badWord) {
        return badWord == word;
    });
    if (forbidden) {
        forbiddenCount++;
    }
}
// Reads `filename` as UTF-8 text and adds its statistics to the global
// counters.
//
// Counting rules:
//   - every '.' is one sentence;
//   - a word is a maximal run of non-whitespace characters;
//   - a word is capitalized when its first character is upper case;
//   - a word is counted as a number once, on the first digit it contains;
//   - each finished word is passed to check_word for the forbidden-word test.
//
// When the file cannot be opened a message is printed and nothing is counted.
// A stream failure other than end-of-file increments failCount.
void parseFile(const std::string_view &filename) {
    std::wifstream file;
    // suppress warning of deprecation
#pragma warning(push)
#pragma warning(suppress : 4996)
    file.imbue(std::locale(std::locale(), new std::codecvt_utf8<wchar_t>));
#pragma warning(pop)
    file.open(std::string(filename));
    if (!file.is_open()) {
        std::println("File doesn't exist: {}", filename);
        return;
    }

    bool inWord = false;
    bool isDigit = false;
    wchar_t c;
    std::wstring word;
    while (file.get(c)) {
        if (c == L'.') {
            totalSentenceCount++;
        }
        // The wide classifiers are required here: the narrow std::isspace /
        // isupper / isdigit have undefined behavior for values that do not
        // fit in unsigned char, which decoded UTF-8 text routinely produces.
        if (::iswspace(c)) {
            inWord = false;
            isDigit = false;
            if (!word.empty()) {
                check_word(word, totalForbiddenCount);
                word.clear();
            }
            continue;
        }
        if (!inWord) {
            totalWordCount++;
            if (::iswupper(c)) {
                totalCapitalizedCount++;
            }
        }
        inWord = true;
        if (::iswdigit(c) && !isDigit) {
            totalNumberCount++;
            isDigit = true;
        }
        word.push_back(c);
    }
    // The last word has no trailing whitespace to trigger the check above.
    if (!word.empty()) {
        check_word(word, totalForbiddenCount);
    }
    file.close();
    // Reaching end-of-file also sets failbit, so only a failure without eof
    // (e.g. a decoding error) is reported.
    if (file.fail() && !file.eof()) {
        failCount++;
    }
}
// Analyses every file named on the command line and prints the combined
// report. Returns 1 when no file is given, otherwise 0.
int main(const int argc, char *argv[]) {
    if (argc < 2) {
        std::println("Usage: {} <file1> <file2> ... <fileN>", argv[0]);
        return 1;
    }
    // Deliberately sequential: parseFile adds to plain global counters, so
    // running it under a parallel execution policy is a data race and yields
    // wrong totals. Parallelism needs per-file results that are merged
    // afterwards.
    std::for_each(std::next(argv), argv + argc, parseFile);
    // One entry per command-line argument, including files that failed to open.
    fileCount = static_cast<std::size_t>(argc - 1);

    // Guard each ratio against a zero denominator.
    double capitalizedPercentage = (totalWordCount > 0)
                                       ? static_cast<double>(totalCapitalizedCount) / totalWordCount * 100.0
                                       : 0;
    double forbiddenPercentage = (totalWordCount > 0)
                                     ? static_cast<double>(totalForbiddenCount) / totalWordCount * 100.0
                                     : 0;
    double wordCountPerSentence = (totalSentenceCount > 0)
                                      ? static_cast<double>(totalWordCount) / totalSentenceCount
                                      : 0;
    std::println(
        "Word Count: {}\nCapitalized Count: {}\nSentence Count: {}\nNumber Count: {}\nForbidden Count: {}\nFile Count: {}\nFail Count: {}\nCapitalized Percentage: {}%\nForbidden Percentage: {}%\nWord Count Per Sentence: {}",
        totalWordCount, totalCapitalizedCount, totalSentenceCount, totalNumberCount, totalForbiddenCount, fileCount, failCount,
        capitalizedPercentage, forbiddenPercentage, wordCountPerSentence
    );
    return 0;
}

576
borded_cpp/src/main3.cpp Normal file
View File

@ -0,0 +1,576 @@
#include <algorithm>
#include <array>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <execution>
#include <format>
#include <fstream>
#include <iostream>
#include <limits>
#include <mutex>
#include <string>
#include <string_view>
#include <thread>
#include <unordered_set>
#include <aio.h>
#include <fcntl.h>
#include <sys/signal.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
// Use <print> when the library advertises it; otherwise provide minimal
// std::print / std::println replacements built on std::vformat and std::cout.
// NOTE(review): adding function templates to namespace std is not permitted by
// the C++ standard; consider moving these into a project namespace.
// NOTE(review): the feature-test macro is checked before <print> or <version>
// has been included here — confirm it is actually defined at this point.
#ifdef __cpp_lib_print
#include <print>
#else
namespace std {
// Formats `args` according to `format` and writes the text to std::cout.
template <typename T, typename... Args>
inline void print(T format, Args &&... args) {
auto f = std::vformat(format, std::make_format_args(args...));
std::cout << f;
}
// Same as print, followed by std::endl (newline plus flush).
template <typename T, typename... Args>
inline void println(T format, Args &&... args) {
auto f = std::vformat(format, std::make_format_args(args...));
std::cout << f << std::endl;
}
}
#endif
constexpr std::array<std::string_view, 35> BAD_WORDS = {
"recovery",
"techie",
"http",
"https",
"digital",
"hack",
"::",
"//",
"com",
"@",
"crypto",
"bitcoin",
"wallet",
"hacker",
"welcome",
"whatsapp",
"email",
"cryptocurrency",
"stolen",
"freeze",
"quick",
"crucial",
"tracing",
"scammers",
"expers",
"hire",
"century",
"transaction",
"essential",
"managing",
"contact",
"contacting",
"understanding",
"assets",
"funds",
};
const std::unordered_set<std::string_view> BAD_WORDS_SET(BAD_WORDS.begin(), BAD_WORDS.end());
// Builds the 256-entry lookup table for the reflected CRC-32 polynomial
// 0xEDB88320. Generating it at compile time replaces 256 hand-copied
// constants, so a single mistyped digit can no longer corrupt the hash.
constexpr std::array<unsigned int, 256> make_crc_table() {
    std::array<unsigned int, 256> table{};
    for (unsigned int byte = 0; byte < 256; ++byte) {
        unsigned int value = byte;
        for (int bit = 0; bit < 8; ++bit) {
            value = (value & 1u) ? (0xedb88320u ^ (value >> 1)) : (value >> 1);
        }
        table[byte] = value;
    }
    return table;
}

// Indexed exactly like the former C array: crc_table[byte].
static constexpr std::array<unsigned int, 256> crc_table = make_crc_table();

// CRC-32 of the characters in `str` (initial value and final XOR 0xffffffff).
constexpr uint32_t crc32(std::string_view str) {
    uint32_t crc = 0xffffffff;
    for (auto c : str)
        // `& 0xff` keeps the index in range even when `c` is a negative char.
        crc = (crc >> 8) ^ crc_table[(crc ^ c) & 0xff];
    return crc ^ 0xffffffff;
}

// CRC-32 of the first `size` bytes at `str`; same result as the string_view
// overload for the same bytes.
constexpr uint32_t crc32(char const *str, const size_t size) {
    uint32_t crc = 0xffffffff;
    for (size_t i = 0; i < size; ++i)
        crc = (crc >> 8) ^ crc_table[(crc ^ str[i]) & 0xff];
    return crc ^ 0xffffffff;
}
constexpr std::array<uint32_t, 35> BAD_WORDS_HASH = {
crc32("recovery"),
crc32("techie"),
crc32("http"),
crc32("https"),
crc32("digital"),
crc32("hack"),
crc32("::"),
crc32("//"),
crc32("com"),
crc32("@"),
crc32("crypto"),
crc32("bitcoin"),
crc32("wallet"),
crc32("hacker"),
crc32("welcome"),
crc32("whatsapp"),
crc32("email"),
crc32("cryptocurrency"),
crc32("stolen"),
crc32("freeze"),
crc32("quick"),
crc32("crucial"),
crc32("tracing"),
crc32("scammers"),
crc32("expers"),
crc32("hire"),
crc32("century"),
crc32("transaction"),
crc32("essential"),
crc32("managing"),
crc32("contact"),
crc32("contacting"),
crc32("understanding"),
crc32("assets"),
crc32("funds")
};
const std::unordered_set BAD_WORDS_STR(BAD_WORDS.begin(), BAD_WORDS.end());
constexpr auto SHORTEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::max(),
[](std::size_t current, const std::string_view &word) {
return std::min(current, word.size());
}
);
constexpr auto LONGEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::min(),
[](std::size_t current, const std::string_view &word) {
return std::max(current, word.size());
}
);
// Totals over all files. They are written from the AIO notification threads,
// so every update after the requests are submitted happens under statsMutex.
int totalWordCount = 0;
int totalCapitalizedCount = 0;
int totalSentenceCount = 0;
int totalNumberCount = 0;
int totalForbiddenCount = 0;
// Starts at 0 and is incremented once per file, so the report shows the real
// number of files (it used to start at 1 and print one too many).
int fileCount = 0;
int failCount = 0;
// Number of submitted reads whose completion handler has not finished yet.
int done = 0;
// Protects all counters above; doneCv is signalled whenever `done` drops.
std::mutex statsMutex;
std::condition_variable doneCv;

// Per-request context handed to the completion handler through sigev_value.
struct info {
    std::string_view name;
    aiocb *cb;
    const std::chrono::time_point<std::chrono::steady_clock> start = std::chrono::steady_clock::now();
};

// Returns true when the `size` bytes at `word` are exactly one of BAD_WORDS.
// The CRC-32 comparison is only a fast pre-filter: a hash hit is confirmed
// against the actual word so a collision cannot produce a false positive.
constexpr bool is_bad_word(const char *word, const ssize_t size) {
    if (size < 0) {
        return false;
    }
    const auto length = static_cast<std::size_t>(size);
    if (length < SHORTEST_BAD_WORD || length > LONGEST_BAD_WORD) {
        return false;
    }
    const auto hs = crc32(word, length);
    for (std::size_t i = 0; i < BAD_WORDS_HASH.size(); ++i) {
        if (BAD_WORDS_HASH[i] == hs && BAD_WORDS[i] == std::string_view(word, length)) {
            return true;
        }
    }
    return false;
}

// Increments totalForbiddenCount when the word is forbidden.
// Not synchronized: call it only while holding statsMutex or from one thread.
constexpr void check_word_simple(const char *word, const ssize_t size) {
    if (is_bad_word(word, size)) {
        totalForbiddenCount++;
    }
}

// Scans the first `size` bytes of `str` and adds the findings to the totals.
//
// Counting rules:
//   - every '.' is one sentence;
//   - a word is a maximal run of characters other than space, \n, \r, \t;
//   - a word is capitalized when its first byte is 'A'..'Z';
//   - a digit after the first byte counts the word as a number; the rest of
//     that word is skipped and it is not checked against the forbidden list;
//   - every other word is checked against the forbidden list when it ends.
//
// Counts are collected locally and merged under statsMutex once, so the
// function may run on several threads at the same time.
void read_str(char *str, ssize_t size) {
    const auto is_separator = [](const char c) {
        return c == ' ' || c == '\n' || c == '\r' || c == '\t';
    };
    int words = 0;
    int capitalized = 0;
    int sentences = 0;
    int numbers = 0;
    int forbidden = 0;
    ssize_t mark = -1; // start of the current word, -1 when between words

    // `pos < size`: str[size] is past the data that was read. Looking at it
    // used to count a phantom word whenever the text ended in whitespace.
    for (ssize_t pos = 0; pos < size; ++pos) {
        const char c = str[pos];
        if (c == '.') {
            ++sentences;
        }
        if (is_separator(c)) {
            if (mark != -1) {
                if (is_bad_word(str + mark, pos - mark)) {
                    ++forbidden;
                }
                mark = -1;
            }
        } else if (mark == -1) {
            ++words;
            if (c >= 'A' && c <= 'Z') {
                ++capitalized;
            }
            mark = pos;
        } else if (c >= '0' && c <= '9') {
            ++numbers;
            // Skip to the end of this word, still counting sentence dots.
            for (; pos < size; ++pos) {
                const char rest = str[pos];
                if (rest == '.') {
                    ++sentences;
                }
                if (is_separator(rest)) {
                    break;
                }
            }
            mark = -1;
        }
    }
    // A final word that runs up to the end of the buffer.
    if (mark != -1 && is_bad_word(str + mark, size - mark)) {
        ++forbidden;
    }

    const std::lock_guard lock(statsMutex);
    totalWordCount += words;
    totalCapitalizedCount += capitalized;
    totalSentenceCount += sentences;
    totalNumberCount += numbers;
    totalForbiddenCount += forbidden;
}

// Runs on a notification thread when one read has completed: analyses the
// buffer (or records the failure), releases the request's `info` object and
// signals main that one more request is finished.
void aio_completion_handler(sigval_t sigval) {
    info *data = (info *)sigval.sival_ptr;
    auto req = data->cb;
    bool failed = false;
    /* Did the request complete? */
    auto error = aio_error(req);
    if (error == 0) {
        /* Request completed successfully, get the return status */
        const ssize_t bytesRead = aio_return(req);
        if (bytesRead >= 0) {
            read_str((char *)req->aio_buf, bytesRead);
        } else {
            failed = true;
        }
    } else {
        std::println("Error at aio_error ({}): ", error);
        failed = true;
    }
    delete data; // allocated in main, owned by this handler

    // Notify while still holding the lock: main cannot observe done == 0 and
    // start tearing things down until this thread has finished signalling.
    const std::lock_guard lock(statsMutex);
    fileCount++;
    if (failed) {
        failCount++;
    }
    --done;
    doneCv.notify_one();
}

// Submits one asynchronous read (at most 5 MiB) per command-line file, waits
// until every completion handler has run and prints the combined report.
// Returns 1 when no file is given or any file failed, otherwise 0.
int main(const int argc, char *argv[]) {
    if (argc < 2) {
        std::println("Usage: {} <file1> <file2> ... <fileN>", argv[0]);
        return 1;
    }
    const std::size_t requested = static_cast<std::size_t>(argc - 1);
    // 5mb per file; anything beyond that is not read.
    constexpr std::size_t BUFFER_SIZE = 5 * 1024 * 1024;

    // calloc, not malloc: every aiocb field that is not set below
    // (aio_reqprio, ...) must be zero rather than indeterminate.
    auto aiocb_list = (struct aiocb *)calloc(requested, sizeof(struct aiocb));
    auto aiocb_list_ptr = (struct aiocb **)calloc(requested, sizeof(struct aiocb *));
    if (aiocb_list == nullptr || aiocb_list_ptr == nullptr) {
        std::println("Out of memory while preparing {} read requests", requested);
        free(aiocb_list);
        free(aiocb_list_ptr);
        return 1;
    }

    std::size_t queued = 0;
    for (std::size_t i = 0; i < requested; i++) {
        char *path = argv[i + 1];
        const int fd = open(path, O_RDONLY);
        void *buffer = (fd >= 0) ? malloc(BUFFER_SIZE) : nullptr;
        if (buffer == nullptr) {
            // Never submit a request that cannot succeed: it would produce no
            // completion notification and the wait below would never end.
            std::println("Could not open or buffer file: {}", path);
            if (fd >= 0) {
                close(fd);
            }
            fileCount++;
            failCount++;
            continue;
        }
        struct aiocb &cb = aiocb_list[queued];
        cb.aio_fildes = fd;
        cb.aio_lio_opcode = LIO_READ; // lio_listio requires an explicit opcode
        cb.aio_offset = 0;
        cb.aio_buf = buffer;
        cb.aio_nbytes = BUFFER_SIZE;
        cb.aio_sigevent.sigev_notify = SIGEV_THREAD;
        cb.aio_sigevent.sigev_notify_function = aio_completion_handler;
        cb.aio_sigevent.sigev_notify_attributes = nullptr;
        cb.aio_sigevent.sigev_value.sival_ptr = new info{path, &cb};
        aiocb_list_ptr[queued] = &cb;
        ++queued;
    }

    // No notification thread exists yet, so this write needs no lock.
    done = static_cast<int>(queued);
    if (queued > 0) {
        if (lio_listio(LIO_WAIT, aiocb_list_ptr, static_cast<int>(queued), nullptr) != 0) {
            // NOTE(review): a request that could not be enqueued at all may
            // never trigger its handler — confirm how that case should end.
            std::println("lio_listio reported an error; some reads may have failed");
        }
        // LIO_WAIT only waits for the I/O; the handlers run on their own
        // threads, so block until the last one has reported in.
        std::unique_lock lock(statsMutex);
        doneCv.wait(lock, [] { return done <= 0; });
    }
    std::println("Done reading files, {} done", done);

    // Guard each ratio against a zero denominator.
    double capitalizedPercentage = (totalWordCount > 0)
                                       ? static_cast<double>(totalCapitalizedCount) / totalWordCount * 100.0
                                       : 0;
    double forbiddenPercentage = (totalWordCount > 0)
                                     ? static_cast<double>(totalForbiddenCount) / totalWordCount * 100.0
                                     : 0;
    double wordCountPerSentence = (totalSentenceCount > 0)
                                      ? static_cast<double>(totalWordCount) / totalSentenceCount
                                      : 0;
    std::println(
        "Word Count: {}\nCapitalized Count: {}\nSentence Count: {}\nNumber Count: {}\nForbidden Count: {}\nFile Count: {}\nFail Count: {}\nCapitalized Percentage: {}%\nForbidden Percentage: {}%\nWord Count Per Sentence: {}",
        totalWordCount, totalCapitalizedCount, totalSentenceCount, totalNumberCount, totalForbiddenCount, fileCount, failCount,
        capitalizedPercentage, forbiddenPercentage, wordCountPerSentence
    );

    for (std::size_t i = 0; i < queued; i++) {
        close(aiocb_list[i].aio_fildes);
        free((void *)aiocb_list[i].aio_buf);
    }
    free(aiocb_list);
    free(aiocb_list_ptr);
    if (failCount > 0) {
        return 1;
    }
    return 0;
}

3
jest_rust/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
/target
/Cargo.lock
/test_books

15
jest_rust/Cargo.toml Normal file
View File

@ -0,0 +1,15 @@
[package]
name = "jisspam"
version = "0.1.0"
edition = "2024"
[dependencies]
fxhash = "0.2.1"
tokio = { version = "1.44.1", features = ["full"] }
[profile.release]
codegen-units = 1 # less means more compile work but better optimized
lto = "fat" # thin has best performance. fat the worst
strip = true
# opt-level = "z" # slows down
panic = "abort"

84
jest_rust/README.md Normal file
View File

@ -0,0 +1,84 @@
for https://retoor.molodetz.nl/retoor/isspam
extract `../books.tar.gz`
# local machine benchmarks
single threaded: `33.63373279571533`
rayon: `4.294418811798096`
tokio: `4.717588901519775`
tokio:
muncher: `2486ms`
for_loops: `1227ms`
for_loops_forbidden_only: `987ms`
trie creation and stats accumulation take 0ms
FxHashMap faster than BTreeMap
## compile options benchmarks
`lto` thin, fat doesn't change much
`codegen-units` 0, 1 doesn't change much
`opt-level = "z"` slows things down
# ubuntu terminal running
Instructions for running this in the Ubuntu terminal at https://snek.molodetz.nl/terminal.html:
```
mkdir /project
cd /project
git clone https://retoor.molodetz.nl/retoor/isspam.git
apt install valgrind curl
export RUSTUP_HOME=/project/.rustup
export CARGO_HOME=/project/.cargo
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
. "/project/.cargo/env"
cd isspam
rustup install nightly
rustup default nightly
make
make benchmark
python3 bench.py
```
clone: `git clone https://gitlab.com/jestdotty-group/draft/jisspam.git jest_rust`
edit make: `vi makefile` and add build:
```
build_jest:
@echo "compiling jest_rust project"
cd jest_rust && cargo build --release && cp target/release/jisspam ..
```
append to all script:
```
all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest
```
add to bench: `vi bench.py`
```py
time_start = time.time()
subprocess.check_output('./jisspam books/*.txt', shell=True)
print("Time Jest Rust:", time.time() - time_start)
```
run: `python3 bench.py`
output looks something like this:
```
***benchmarking***
Time C: 31.315868377685547
Time Rust: 41.232205867767334
Time CPP: 20.1683189868927
Time Borded CPP: 15.468477964401245
Time Jest Rust: 54.74523115158081
Time Retoor Python: 287.63036131858826
***end benchmark***
```
add `/jisspam` to `.gitignore` to not commit the executable accidentally

122
jest_rust/src/main.rs Normal file
View File

@ -0,0 +1,122 @@
mod parser;
mod stats;
mod trie;
use stats::Stats;
use std::{env, fs, sync::LazyLock};
use tokio::sync::mpsc;
use trie::Trie;
/// Trie holding every forbidden token; built once, on first access.
static FORBIDDEN_WORDS: LazyLock<Trie> = LazyLock::new(|| {
    const WORDS: [&str; 35] = [
        "recovery", "techie", "http", "https", "digital",
        "hack", "::", "//", "@", "com",
        "crypto", "bitcoin", "wallet", "hacker", "welcome",
        "whatsapp", "email", "cryptocurrency", "stolen", "freeze",
        "quick", "crucial", "tracing", "scammers", "expers",
        "hire", "century", "transaction", "essential", "managing",
        "contact", "contacting", "understanding", "assets", "funds",
    ];
    let mut trie = Trie::default();
    WORDS.into_iter().for_each(|word| {
        trie.insert(word);
    });
    trie
});
/// Spawns one task per file given on the command line, collects each task's
/// `Stats` through a channel, sums them and prints the total.
#[tokio::main]
async fn main() {
    let (sender, mut receiver) = mpsc::unbounded_channel();
    for path in env::args().skip(1) {
        let sender = sender.clone();
        tokio::spawn(async move {
            let mut file_stats = Stats::default();
            //reading files in threads doesn't change speed of any sort but oh well
            match fs::read_to_string(&path) {
                Ok(contents) => {
                    file_stats.file_count += 1;
                    parser::for_loops::parse(&mut file_stats, &contents);
                }
                Err(_) => file_stats.failed_file_count += 1,
            }
            // a send error only means the receiver is gone; nothing to do then
            let _ = sender.send(file_stats);
        });
    }
    // Drop the original sender so `recv` yields `None` once every task is done.
    drop(sender);

    let mut totals = Stats::default();
    while let Some(file_stats) = receiver.recv().await {
        totals += file_stats;
    }
    println!("{totals}");
}
/// needs ../books.tar.gz to be extracted into ../books
///
/// Builds the release binary, runs it over every file in `../books` and prints
/// its output together with the elapsed time. Build or run errors are only
/// printed; a missing `../books` directory panics.
#[test]
fn test() {
    use std::{env, fs, process::Command, time::Instant};

    let cwd = env::current_dir().unwrap();
    println!("cwd: {}", cwd.display());

    //compile
    let build = Command::new("cargo").arg("build").arg("--release").output();
    match build {
        Ok(output) => println!("compiled {}", String::from_utf8_lossy(&output.stdout)),
        Err(err) => eprintln!("compile failed: {err}"),
    }

    //get test files
    let mut files = Vec::new();
    for entry in fs::read_dir("../books").unwrap() {
        let absolute = entry.unwrap().path().canonicalize().unwrap();
        files.push(absolute.to_str().unwrap().to_string());
    }
    println!("test files found: {}", files.len());
    println!();

    //benchmark run
    let started = Instant::now();
    let run = Command::new("target/release/jisspam").args(files).output();
    match run {
        Ok(output) => println!("{}", String::from_utf8_lossy(&output.stdout)),
        Err(err) => eprintln!("run failed: {err}"),
    }
    println!("benchmark: {}ms", started.elapsed().as_millis());
}

View File

@ -0,0 +1,37 @@
use crate::{FORBIDDEN_WORDS, stats::Stats};
#[allow(dead_code)]
/// typically 5000ms
/// with trie this is 1600ms
///
/// Splits `text` into sentences on `.` and each sentence into words on ASCII
/// whitespace, skipping pieces that are empty after trimming, and updates the
/// sentence, word, numeric, capitalized and forbidden counters in `stats`.
pub fn parse(stats: &mut Stats, text: &str) {
    for raw_sentence in text.split('.') {
        let sentence = raw_sentence.trim();
        if sentence.is_empty() {
            continue;
        }
        stats.sentence_count += 1;

        for raw_word in sentence.split_ascii_whitespace() {
            let word = raw_word.trim();
            if word.is_empty() {
                continue;
            }
            stats.word_count += 1;

            // a numeric character is never ASCII uppercase, so one flag
            // covers both ways a word stops being "capitalized"
            let mut only_uppercase = true;
            for c in word.chars() {
                if c.is_numeric() {
                    stats.numeric_count += 1;
                }
                only_uppercase &= c.is_ascii_uppercase();
            }
            if only_uppercase {
                stats.capitalized_count += 1;
            }

            let lowered = word.to_lowercase();
            if FORBIDDEN_WORDS.contains(&lowered) {
                stats.forbidden_count += 1;
            }
        }
    }
}

View File

@ -0,0 +1,14 @@
use crate::{FORBIDDEN_WORDS, stats::Stats};
#[allow(dead_code)]
/// Counts forbidden words only: every ASCII-whitespace separated token of
/// `text` is trimmed, lowercased and looked up in `FORBIDDEN_WORDS`.
/// No other counter in `stats` is touched.
pub fn parse(stats: &mut Stats, text: &str) {
    text.split_ascii_whitespace()
        .map(str::trim)
        .filter(|token| !token.is_empty())
        .filter(|token| FORBIDDEN_WORDS.contains(&token.to_lowercase()))
        .for_each(|_| stats.forbidden_count += 1);
}

View File

@ -0,0 +1,3 @@
// Alternative parser implementations.
// NOTE(review): the per-module notes below assume the module files appear in
// this diff in the same order as these declarations - confirm.

// nested sentence/word loops that fill every counter; the one `main` calls
pub mod for_loops;
// whitespace-split loop that only updates the forbidden word counter
pub mod for_loops_forbidden_only;
// single pass over the characters of the text
pub mod muncher;

View File

@ -0,0 +1,66 @@
use crate::{FORBIDDEN_WORDS, stats::Stats};
#[allow(dead_code)]
/// Single-pass, character-by-character parser.
///
/// Walks `text` once while collecting the current word in a buffer. A word
/// ends at any whitespace character, at a `.`, or at the end of the text. A
/// sentence ends at a `.` or at the end of the text, and is only counted when
/// it contained at least one word, so runs of dots or whitespace never
/// produce empty words or sentences. New lines are ordinary whitespace, not
/// sentence breaks.
///
/// Per word this updates `word_count`, `capitalized_count` (every character
/// is ASCII uppercase) and `forbidden_count` (the lowercased word is in
/// `FORBIDDEN_WORDS`); `numeric_count` is incremented once per numeric
/// character.
///
/// Timings noted for the previous version of this function:
/// 500ms is without forbidden words check, but...
/// 6000ms if adding forbidden words.. so not faster
/// with trie this is 2600ms
pub fn parse(stats: &mut Stats, text: &str) {
    let mut word = String::new();
    let mut capitalized = true;
    let mut sentence_has_words = false;

    for c in text.chars() {
        if c == '.' {
            sentence_has_words |= finish_word(stats, &mut word, &mut capitalized);
            if sentence_has_words {
                stats.sentence_count += 1;
                sentence_has_words = false;
            }
        } else if c.is_whitespace() {
            sentence_has_words |= finish_word(stats, &mut word, &mut capitalized);
        } else {
            word.push(c);
            if c.is_numeric() {
                stats.numeric_count += 1;
            }
            // numeric characters are not ASCII uppercase either, so this one
            // check covers both reasons for a word not being capitalized
            capitalized &= c.is_ascii_uppercase();
        }
    }

    // the text may end without a closing period or trailing whitespace
    sentence_has_words |= finish_word(stats, &mut word, &mut capitalized);
    if sentence_has_words {
        stats.sentence_count += 1;
    }
}

/// Counts the buffered `word` (if any) and resets the per-word state.
///
/// Returns `true` when a word was counted, `false` when the buffer was empty.
#[allow(dead_code)]
fn finish_word(stats: &mut Stats, word: &mut String, capitalized: &mut bool) -> bool {
    if word.is_empty() {
        return false;
    }
    stats.word_count += 1;
    if *capitalized {
        stats.capitalized_count += 1;
    }
    if FORBIDDEN_WORDS.contains(&word.to_lowercase()) {
        stats.forbidden_count += 1;
    }
    // reuse the allocation for the next word
    word.clear();
    *capitalized = true;
    true
}

58
jest_rust/src/stats.rs Normal file
View File

@ -0,0 +1,58 @@
use std::{fmt::Display, ops::AddAssign};
/// Counters collected while parsing text.
/// `main` fills one value per file and sums them into a total with `+=`.
#[derive(Debug, Default)]
pub struct Stats {
/// files that were read successfully
pub file_count: u32,
/// files that could not be read
pub failed_file_count: u32,
/// sentences found by the parser
pub sentence_count: u32,
/// words found by the parser
pub word_count: u32,
/// words made up only of ASCII uppercase characters
pub capitalized_count: u32,
/// numeric characters (counted per character, not per word)
pub numeric_count: u32,
/// words whose lowercase form is in the forbidden word list
pub forbidden_count: u32,
}
impl AddAssign for Stats {
fn add_assign(&mut self, rhs: Self) {
self.file_count += rhs.file_count;
self.failed_file_count += rhs.failed_file_count;
self.sentence_count += rhs.sentence_count;
self.word_count += rhs.word_count;
self.capitalized_count += rhs.capitalized_count;
self.numeric_count += rhs.numeric_count;
self.forbidden_count += rhs.forbidden_count;
}
}
impl Display for Stats {
    /// Writes one `name: value` line per counter, followed by the derived
    /// words-per-sentence average and the forbidden/capitalized word
    /// percentages. The last line has no trailing newline.
    ///
    /// A derived value whose denominator is zero (no words or no sentences
    /// counted) is reported as 0 rather than `NaN` or `inf`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // f64 represents every u32 exactly, unlike f32 (exact only up to 2^24)
        fn ratio(numerator: u32, denominator: u32) -> f64 {
            if denominator == 0 {
                0.0
            } else {
                f64::from(numerator) / f64::from(denominator)
            }
        }

        writeln!(f, "file count: {}", self.file_count)?;
        writeln!(f, "failed file count: {}", self.failed_file_count)?;
        writeln!(f, "sentence count: {}", self.sentence_count)?;
        writeln!(f, "word count: {}", self.word_count)?;
        writeln!(f, "capitalized count: {}", self.capitalized_count)?;
        writeln!(f, "numeric count: {}", self.numeric_count)?;
        writeln!(f, "forbidden count: {}", self.forbidden_count)?;
        writeln!(
            f,
            "words per sentence average: {:.1}",
            ratio(self.word_count, self.sentence_count)
        )?;
        writeln!(
            f,
            "forbidden word percentage: {:.2}%",
            ratio(self.forbidden_count, self.word_count) * 100.0,
        )?;
        write!(
            f,
            "capitalized word percentage: {:.2}%",
            ratio(self.capitalized_count, self.word_count) * 100.0,
        )
    }
}

33
jest_rust/src/trie.rs Normal file
View File

@ -0,0 +1,33 @@
use fxhash::FxBuildHasher;
use std::collections::HashMap;
type FxHashMap<K, V> = HashMap<K, V, FxBuildHasher>; //simpler, slightly faster
/// One node of the trie; it stands for the sequence of characters on the
/// path from the root to this node.
#[derive(Default, Debug, Clone)]
struct Node {
/// true when an inserted word ends exactly at this node
end: bool,
/// child nodes, keyed by the next character
children: FxHashMap<char, Node>,
}
/// Set of words stored character by character in a tree of `Node`s.
#[derive(Default, Debug, Clone)]
pub struct Trie {
/// node for the empty prefix; every stored word starts here
root: Node,
}
impl Trie {
    /// Adds `word` to the set, creating any missing nodes along its path.
    pub fn insert(&mut self, word: &str) {
        let last = word
            .chars()
            .fold(&mut self.root, |node, c| node.children.entry(c).or_default());
        last.end = true;
    }

    /// Returns `true` only when exactly `word` was inserted before; a mere
    /// prefix of an inserted word does not count.
    pub fn contains(&self, word: &str) -> bool {
        // try_fold stops at the first character that has no matching child
        word.chars()
            .try_fold(&self.root, |node, c| node.children.get(&c))
            .is_some_and(|node| node.end)
    }
}

85
retoor_c/isspam.py Normal file
View File

@ -0,0 +1,85 @@
import os
import sys
import threading
from concurrent.futures import ThreadPoolExecutor
# NOTE(review): these two constants are not referenced in the visible part of
# this file; kept unchanged in case other code relies on them.
MAX_TEXT_LENGTH = 1024
FORBIDDEN_WORDS_COUNT = 40

# Tokens counted as "forbidden"; looked up by exact membership test.
forbidden_words = {
    "recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
    "@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email",
    "cryptocurrency", "stolen", "freeze", "quick", "crucial", "tracing",
    "scammers", "expers", "hire", "century", "transaction", "essential",
    "managing", "contact", "contacting", "understanding", "assets", "funds",
}
class AnalysisResult:
    """Counters for one input file; every total starts at zero."""

    def __init__(self, filename):
        """Remember `filename` and reset all counters."""
        self.filename = filename
        # chained assignments, in the same attribute order as before
        self.total_word_count = self.total_capitalized_count = 0
        self.total_sentence_count = self.total_number_count = 0
        self.total_forbidden_count = 0
def is_forbidden(word):
    """Return True when `word` is an exact, case-sensitive member of `forbidden_words`."""
    matched = word in forbidden_words
    return matched
def read_file(filename):
    """Return the text of `filename`, or None when it cannot be read.

    The file is decoded as UTF-8 and undecodable bytes are replaced, so one
    badly encoded file cannot abort the whole run. A missing file is reported
    with the usual message; any other OS-level failure (directory, permission
    denied, ...) is reported as well and also yields None.
    """
    # open directly instead of checking os.path.exists first: the file could
    # disappear between the check and the open
    try:
        with open(filename, 'r', encoding='utf-8', errors='replace') as file:
            return file.read()
    except FileNotFoundError:
        print(f"File doesn't exist: {filename}")
    except OSError as exc:
        print(f"Could not read file: {filename} ({exc})")
    return None
def analyze_file(result):
    """Read `result.filename` and store its counts on `result`.

    `result` is left untouched when the file could not be read or is empty.
    """
    text = read_file(result.filename)
    if not text:
        return

    tokens = text.split()
    capitalized = with_digit = forbidden = 0
    for token in tokens:
        # split() never yields empty tokens, so token[0] is safe
        if token[0].isupper():
            capitalized += 1
        if any(char.isdigit() for char in token):
            with_digit += 1
        if is_forbidden(token):
            forbidden += 1

    result.total_sentence_count = text.count('.')
    result.total_word_count = len(tokens)
    result.total_capitalized_count = capitalized
    result.total_number_count = with_digit
    result.total_forbidden_count = forbidden
def main():
    """Analyze every file named on the command line and print the totals.

    Each file is analyzed on a thread pool worker; the per-file results are
    summed afterwards. Percentages and the words-per-sentence average are
    reported as 0 when their denominator is zero.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <file1> <file2> ... <fileN>")
        return

    results = []
    with ThreadPoolExecutor() as executor:
        futures = []
        for filename in sys.argv[1:]:
            result = AnalysisResult(filename)
            results.append(result)
            futures.append(executor.submit(analyze_file, result))
        for future in futures:
            # re-raises any exception from the worker
            future.result()

    total_word_count = sum(result.total_word_count for result in results)
    total_capitalized_count = sum(result.total_capitalized_count for result in results)
    total_sentence_count = sum(result.total_sentence_count for result in results)
    total_number_count = sum(result.total_number_count for result in results)
    total_forbidden_count = sum(result.total_forbidden_count for result in results)

    # Conditional expressions only divide when the denominator is non-zero.
    # Multiplying by a boolean guard would still evaluate the division and
    # raise ZeroDivisionError.
    capitalized_percentage = (
        total_capitalized_count / total_word_count * 100.0 if total_word_count > 0 else 0.0
    )
    forbidden_percentage = (
        total_forbidden_count / total_word_count * 100.0 if total_word_count > 0 else 0.0
    )
    word_count_per_sentence = (
        total_word_count / total_sentence_count if total_sentence_count > 0 else 0.0
    )

    print(f"\nTotal Words: {total_word_count}")
    print(f"Total Capitalized words: {total_capitalized_count}")
    print(f"Total Sentences: {total_sentence_count}")
    print(f"Total Numbers: {total_number_count}")
    print(f"Total Forbidden words: {total_forbidden_count}")
    print(f"Capitalized percentage: {capitalized_percentage:.6f}%")
    print(f"Forbidden percentage: {forbidden_percentage:.6f}%")
    print(f"Word count per sentence: {word_count_per_sentence:.6f}")
    print(f"Total files read: {len(sys.argv) - 1}")


if __name__ == "__main__":
    main()