Compare commits
27 Commits
4415dd26ae
...
5fc6c839a1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5fc6c839a1 | ||
|
|
c459fe6d79 | ||
|
|
d73d4ff7c1 | ||
|
|
94b786f83a | ||
|
|
422e8ace29 | ||
|
|
c581ca6817 | ||
|
|
41f5398f20 | ||
|
|
55a4901a30 | ||
|
|
320f6bf4af | ||
|
|
a699aba7c2 | ||
|
|
90cd44f302 | ||
|
|
e3c71f8fc9 | ||
| 6ddef94103 | |||
|
|
0eaf353463 | ||
|
|
1c0fc334e8 | ||
|
|
f1b9005c9c | ||
|
|
50e01a2fe6 | ||
|
|
f2c0942cc2 | ||
|
|
177a512c38 | ||
| 42938575d3 | |||
|
|
b14337dc60 | ||
| 7dc6143a7f | |||
|
|
6b148b3235 | ||
|
|
c78d43b593 | ||
| 9b1ab5ed0c | |||
| 748c404404 | |||
|
|
8528fe8f0f |
3
.gitignore
vendored
3
.gitignore
vendored
@ -5,7 +5,10 @@ publish
|
|||||||
books
|
books
|
||||||
__pycache__
|
__pycache__
|
||||||
target
|
target
|
||||||
|
./isspam.py
|
||||||
isspam
|
isspam
|
||||||
risspam
|
risspam
|
||||||
|
/jisspam
|
||||||
isspam_cpp
|
isspam_cpp
|
||||||
.build-trigger-2014-12-02 15:26
|
.build-trigger-2014-12-02 15:26
|
||||||
|
borded_cpp_exec
|
||||||
|
|||||||
14
Makefile
14
Makefile
@ -1,20 +1,32 @@
|
|||||||
CC = gcc
|
CC = gcc
|
||||||
CFLAGS = -Ofast
|
CFLAGS = -Ofast
|
||||||
|
|
||||||
all: build run valgrind build_risspam run_risspam build_cpp
|
all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest
|
||||||
|
|
||||||
build:
|
build:
|
||||||
@echo "Compiling retoor_c project.".
|
@echo "Compiling retoor_c project.".
|
||||||
@$(CC) $(CFLAGS) retoor_c/isspam.c -o isspam
|
@$(CC) $(CFLAGS) retoor_c/isspam.c -o isspam
|
||||||
|
|
||||||
|
build_py:
|
||||||
|
@echo "Copying py file"
|
||||||
|
@cp retoor_c/isspam.py isspam.py
|
||||||
|
|
||||||
build_cpp:
|
build_cpp:
|
||||||
@echo "Compiling C++ version of isspam."
|
@echo "Compiling C++ version of isspam."
|
||||||
@g++ -Ofast retoor_c/isspam.cpp -o isspam_cpp
|
@g++ -Ofast retoor_c/isspam.cpp -o isspam_cpp
|
||||||
|
|
||||||
|
build_borded_cpp:
|
||||||
|
@echo "Compiling Borded C++ version of isspam."
|
||||||
|
@g++ -std=c++23 -Ofast borded_cpp/src/main3.cpp -o borded_cpp_exec
|
||||||
|
|
||||||
build_risspam:
|
build_risspam:
|
||||||
@echo "Compiling 12bitfloat_risspam project."
|
@echo "Compiling 12bitfloat_risspam project."
|
||||||
cd 12bitfloat_rust/risspam && cargo run --release && cp target/release/risspam ../../
|
cd 12bitfloat_rust/risspam && cargo run --release && cp target/release/risspam ../../
|
||||||
|
|
||||||
|
build_jest:
|
||||||
|
@echo "compiling jest_rust project"
|
||||||
|
cd jest_rust && cargo build --release && cp target/release/jisspam ..
|
||||||
|
|
||||||
run: run_spam wl run_not_spam
|
run: run_spam wl run_not_spam
|
||||||
run_risspam: run_spam_risspam run_not_spam_risspam
|
run_risspam: run_spam_risspam run_not_spam_risspam
|
||||||
|
|
||||||
|
|||||||
9
bench.py
9
bench.py
@ -11,4 +11,13 @@ print("Time Rust:",time.time() - time_start)
|
|||||||
time_start = time.time()
|
time_start = time.time()
|
||||||
subprocess.check_output('./isspam_cpp books/*.txt', shell=True)
|
subprocess.check_output('./isspam_cpp books/*.txt', shell=True)
|
||||||
print("Time CPP:",time.time() - time_start)
|
print("Time CPP:",time.time() - time_start)
|
||||||
|
time_start = time.time()
|
||||||
|
subprocess.check_output('./borded_cpp_exec books/*.txt', shell=True)
|
||||||
|
print("Time Borded CPP:",time.time() - time_start)
|
||||||
|
time_start = time.time()
|
||||||
|
subprocess.check_output('./jisspam books/*.txt', shell=True)
|
||||||
|
print("Time Jest Rust:", time.time() - time_start)
|
||||||
|
time_start = time.time()
|
||||||
|
subprocess.check_output('python3 isspam.py books/*.txt', shell=True)
|
||||||
|
print("Time Retoor Python:",time.time() - time_start)
|
||||||
print("***end benchmark***")
|
print("***end benchmark***")
|
||||||
|
|||||||
97
borded_cpp/.gitignore
vendored
Normal file
97
borded_cpp/.gitignore
vendored
Normal file
@ -0,0 +1,97 @@
|
|||||||
|
*.d
|
||||||
|
*.slo
|
||||||
|
*.lo
|
||||||
|
*.o
|
||||||
|
*.obj
|
||||||
|
*.gch
|
||||||
|
*.pch
|
||||||
|
*.so
|
||||||
|
*.dylib
|
||||||
|
*.dll
|
||||||
|
*.mod
|
||||||
|
*.smod
|
||||||
|
*.lai
|
||||||
|
*.la
|
||||||
|
*.a
|
||||||
|
*.lib
|
||||||
|
*.exe
|
||||||
|
*.out
|
||||||
|
*.app
|
||||||
|
.idea/**/workspace.xml
|
||||||
|
.idea/**/tasks.xml
|
||||||
|
.idea/**/usage.statistics.xml
|
||||||
|
.idea/**/dictionaries
|
||||||
|
.idea/**/shelf
|
||||||
|
.idea/**/aws.xml
|
||||||
|
.idea/**/contentModel.xml
|
||||||
|
.idea/**/dataSources/
|
||||||
|
.idea/**/dataSources.ids
|
||||||
|
.idea/**/dataSources.local.xml
|
||||||
|
.idea/**/sqlDataSources.xml
|
||||||
|
.idea/**/dynamic.xml
|
||||||
|
.idea/**/uiDesigner.xml
|
||||||
|
.idea/**/dbnavigator.xml
|
||||||
|
.idea/**/gradle.xml
|
||||||
|
.idea/**/libraries
|
||||||
|
.idea
|
||||||
|
cmake-build-*/
|
||||||
|
.idea/**/mongoSettings.xml
|
||||||
|
*.iws
|
||||||
|
out/
|
||||||
|
.idea_modules/
|
||||||
|
atlassian-ide-plugin.xml
|
||||||
|
.idea/replstate.xml
|
||||||
|
.idea/sonarlint/
|
||||||
|
com_crashlytics_export_strings.xml
|
||||||
|
crashlytics.properties
|
||||||
|
crashlytics-build.properties
|
||||||
|
fabric.properties
|
||||||
|
.idea/httpRequests
|
||||||
|
.idea/caches/build_file_checksums.ser
|
||||||
|
*~
|
||||||
|
.fuse_hidden*
|
||||||
|
.directory
|
||||||
|
.Trash-*
|
||||||
|
.nfs*
|
||||||
|
CMakeLists.txt.user
|
||||||
|
CMakeCache.txt
|
||||||
|
CMakeFiles
|
||||||
|
CMakeScripts
|
||||||
|
Testing
|
||||||
|
Makefile
|
||||||
|
cmake_install.cmake
|
||||||
|
install_manifest.txt
|
||||||
|
compile_commands.json
|
||||||
|
CTestTestfile.cmake
|
||||||
|
_deps
|
||||||
|
CMakeUserPresets.json
|
||||||
|
Thumbs.db
|
||||||
|
Thumbs.db:encryptable
|
||||||
|
ehthumbs.db
|
||||||
|
ehthumbs_vista.db
|
||||||
|
*.stackdump
|
||||||
|
[Dd]esktop.ini
|
||||||
|
$RECYCLE.BIN/
|
||||||
|
*.cab
|
||||||
|
*.msi
|
||||||
|
*.msix
|
||||||
|
*.msm
|
||||||
|
*.msp
|
||||||
|
*.lnk
|
||||||
|
.DS_Store
|
||||||
|
.AppleDouble
|
||||||
|
.LSOverride
|
||||||
|
Icon
|
||||||
|
._*
|
||||||
|
.DocumentRevisions-V100
|
||||||
|
.fseventsd
|
||||||
|
.Spotlight-V100
|
||||||
|
.TemporaryItems
|
||||||
|
.Trashes
|
||||||
|
.VolumeIcon.icns
|
||||||
|
.com.apple.timemachine.donotpresent
|
||||||
|
.AppleDB
|
||||||
|
.AppleDesktop
|
||||||
|
Network Trash Folder
|
||||||
|
Temporary Items
|
||||||
|
.apdisk
|
||||||
27
borded_cpp/CMakeLists.txt
Normal file
27
borded_cpp/CMakeLists.txt
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
cmake_minimum_required(VERSION 3.25)
|
||||||
|
project(isspam)
|
||||||
|
|
||||||
|
set(CMAKE_CXX_STANDARD 26)
|
||||||
|
if (MSVC)
|
||||||
|
add_compile_options(/W4)
|
||||||
|
add_compile_options(/WX)
|
||||||
|
add_compile_options(/external:anglebrackets)
|
||||||
|
add_compile_options(/external:W0)
|
||||||
|
add_compile_options(/wd4100)
|
||||||
|
add_compile_options(/wd5050)
|
||||||
|
add_definitions(-DWIN32_LEAN_AND_MEAN -DVC_EXTRALEAN)
|
||||||
|
add_compile_definitions(WIN32_LEAN_AND_MEAN NOMINMAX)
|
||||||
|
else ()
|
||||||
|
add_compile_options(-Wall)
|
||||||
|
add_compile_options(-Wextra)
|
||||||
|
add_compile_options(-Wpedantic)
|
||||||
|
# add_compile_options(-Werror)
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
add_executable(${PROJECT_NAME} src/main.cpp)
|
||||||
|
add_executable(${PROJECT_NAME}3 src/main3.cpp)
|
||||||
|
|
||||||
|
if (LINUX)
|
||||||
|
target_link_libraries(${PROJECT_NAME} tbb)
|
||||||
|
target_link_libraries(${PROJECT_NAME}3 tbb)
|
||||||
|
endif ()
|
||||||
3
borded_cpp/Dockerfile
Normal file
3
borded_cpp/Dockerfile
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
FROM gcc:latest
|
||||||
|
RUN apt update && apt install -y cmake gdb
|
||||||
|
WORKDIR /home
|
||||||
9
borded_cpp/compose.yml
Normal file
9
borded_cpp/compose.yml
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
services:
|
||||||
|
cpp:
|
||||||
|
build: .
|
||||||
|
command: ["sh","doit.sh"]
|
||||||
|
tty: true
|
||||||
|
stdin_open: true
|
||||||
|
volumes:
|
||||||
|
- ./:/home
|
||||||
|
- ../books:/books
|
||||||
2
borded_cpp/doit.sh
Executable file
2
borded_cpp/doit.sh
Executable file
@ -0,0 +1,2 @@
|
|||||||
|
rm -rf build | true
|
||||||
|
mkdir build && cd build && cmake .. && make
|
||||||
221
borded_cpp/src/main.cpp
Normal file
221
borded_cpp/src/main.cpp
Normal file
@ -0,0 +1,221 @@
|
|||||||
|
#include <string>
|
||||||
|
#include <string_view>
|
||||||
|
#include <fstream>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <numeric>
|
||||||
|
#include <execution>
|
||||||
|
#include <format>
|
||||||
|
#include <codecvt>
|
||||||
|
#include <ranges>
|
||||||
|
|
||||||
|
#if defined(__cpp_lib_print)
#include <print>
#else
// Fallback used when the standard library does not provide <print>:
// minimal print/println built on std::vformat, writing to std::cout.
// NOTE(review): adding declarations to namespace std is formally undefined
// behaviour; kept because the rest of the file calls std::print/std::println.
namespace std {
    // Format `rest` according to `format` and write the text to std::cout.
    template <typename Fmt, typename... Rest>
    inline void print(Fmt format, Rest &&... rest) {
        std::cout << std::vformat(format, std::make_format_args(rest...));
    }

    // Same as print, followed by a newline and a flush (std::endl).
    template <typename Fmt, typename... Rest>
    inline void println(Fmt format, Rest &&... rest) {
        std::cout << std::vformat(format, std::make_format_args(rest...)) << std::endl;
    }
}
#endif
|
||||||
|
|
||||||
|
constexpr std::array<std::wstring_view, 35> BAD_WORDS = {
|
||||||
|
L"recovery",
|
||||||
|
L"techie",
|
||||||
|
L"http",
|
||||||
|
L"https",
|
||||||
|
L"digital",
|
||||||
|
L"hack",
|
||||||
|
L"::",
|
||||||
|
L"//",
|
||||||
|
L"com",
|
||||||
|
L"@",
|
||||||
|
L"crypto",
|
||||||
|
L"bitcoin",
|
||||||
|
L"wallet",
|
||||||
|
L"hacker",
|
||||||
|
L"welcome",
|
||||||
|
L"whatsapp",
|
||||||
|
L"email",
|
||||||
|
L"cryptocurrency",
|
||||||
|
L"stolen",
|
||||||
|
L"freeze",
|
||||||
|
L"quick",
|
||||||
|
L"crucial",
|
||||||
|
L"tracing",
|
||||||
|
L"scammers",
|
||||||
|
L"expers",
|
||||||
|
L"hire",
|
||||||
|
L"century",
|
||||||
|
L"transaction",
|
||||||
|
L"essential",
|
||||||
|
L"managing",
|
||||||
|
L"contact",
|
||||||
|
L"contacting",
|
||||||
|
L"understanding",
|
||||||
|
L"assets",
|
||||||
|
L"funds",
|
||||||
|
};
|
||||||
|
|
||||||
|
constexpr auto SHORTEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::max(),
|
||||||
|
[](std::size_t current, const std::wstring_view &word) {
|
||||||
|
return std::min(current, word.size());
|
||||||
|
}
|
||||||
|
);
|
||||||
|
constexpr auto LONGEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::min(),
|
||||||
|
[](std::size_t current, const std::wstring_view &word) {
|
||||||
|
return std::max(current, word.size());
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
// Counters gathered while scanning text files. A default-constructed value
// stands for exactly one file (fileCount == 1) with every other counter zero.
struct AnalysisResult {
    std::size_t totalWordCount = 0;         // whitespace-separated words
    std::size_t totalCapitalizedCount = 0;  // words whose first character is upper case
    std::size_t totalSentenceCount = 0;     // '.' characters seen
    std::size_t totalNumberCount = 0;       // words containing at least one digit
    std::size_t totalForbiddenCount = 0;    // words matched by check_word
    std::size_t fileCount = 1;              // files represented by this result

    std::size_t failCount = 0;              // files whose read ended in a failed state

    // Render the counters as a multi-line, human-readable report.
    operator std::string() const {
        return std::format(
            "Word Count: {}\n"
            "Capitalized Count: {}\n"
            "Sentence Count: {}\n"
            "Number Count: {}\n"
            "Forbidden Count: {}\n"
            "File Count: {}\n"
            "Fail Count: {}",
            totalWordCount, totalCapitalizedCount, totalSentenceCount, totalNumberCount,
            totalForbiddenCount, fileCount, failCount
        );
    }

    // Field-wise sum of two results.
    friend AnalysisResult operator+(const AnalysisResult &lhs, const AnalysisResult &rhs) {
        AnalysisResult sum = lhs;
        sum.totalWordCount += rhs.totalWordCount;
        sum.totalCapitalizedCount += rhs.totalCapitalizedCount;
        sum.totalSentenceCount += rhs.totalSentenceCount;
        sum.totalNumberCount += rhs.totalNumberCount;
        sum.totalForbiddenCount += rhs.totalForbiddenCount;
        sum.fileCount += rhs.fileCount;
        sum.failCount += rhs.failCount;
        return sum;
    }
};
|
||||||
|
|
||||||
|
void check_word(std::wstring &word, std::size_t &forbiddenCount) {
|
||||||
|
if (word.size() < SHORTEST_BAD_WORD || word.size() > LONGEST_BAD_WORD) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
std::ranges::transform(word, word.begin(), ::towlower);
|
||||||
|
if (std::ranges::find(BAD_WORDS, word) != BAD_WORDS.end()) {
|
||||||
|
forbiddenCount++;
|
||||||
|
}
|
||||||
|
// if (std::ranges::find_if(BAD_WORDS, [&word](const std::wstring_view &badWord) {
|
||||||
|
// return word.contains(badWord);
|
||||||
|
// }
|
||||||
|
// ) != BAD_WORDS.end()) {
|
||||||
|
// forbiddenCount++;
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scan one UTF-8 encoded text file and return the counters gathered for it.
//
// filename: path of the file to read.
// Returns a default AnalysisResult when the file cannot be opened (a message
// is printed in that case). Otherwise returns the per-file counters, with
// failCount set to 1 when the stream ended in a failed state that is not a
// plain end-of-file.
AnalysisResult parseFile(const std::string_view &filename) {
    std::wifstream file;

    // Decode the file as UTF-8. std::codecvt_utf8 is deprecated, so the
    // pragmas suppress MSVC warning C4996 for this single statement.
#pragma warning(push)
#pragma warning(suppress : 4996)
    file.imbue(std::locale(std::locale(), new std::codecvt_utf8<wchar_t>));
#pragma warning(pop)

    file.open(std::string(filename));
    if (!file.is_open()) {
        std::println("File doesn't exist: {}", filename);
        return { };
    }

    AnalysisResult result{ };

    bool inWord = false;   // currently inside a run of non-space characters
    bool isDigit = false;  // a digit was already counted for the current word
    wchar_t c;

    std::wstring word;
    while (file.get(c)) {
        if (c == '.') {
            result.totalSentenceCount++;
        }

        // Use the wide-character classifiers: the <cctype> functions take an
        // int that must be representable as unsigned char, which a decoded
        // wchar_t in general is not (undefined behaviour otherwise).
        const auto wc = static_cast<wint_t>(c);

        if (::iswspace(wc)) {
            // End of a word: reset the per-word state and test the word.
            inWord = false;
            isDigit = false;

            if (!word.empty()) {
                check_word(word, result.totalForbiddenCount);
                word.clear();
            }
            continue;
        }

        if (!inWord) {
            // First character of a new word.
            result.totalWordCount++;
            if (::iswupper(wc)) {
                result.totalCapitalizedCount++;
            }
        }
        inWord = true;

        // Count a word as a number once, on its first digit.
        if (::iswdigit(wc) && !isDigit) {
            result.totalNumberCount++;
            isDigit = true;
        }

        word.push_back(c);
    }

    // The last word is not followed by whitespace when the file lacks a
    // trailing newline.
    if (!word.empty()) {
        check_word(word, result.totalForbiddenCount);
    }

    file.close();

    // Reaching end-of-file sets both failbit and eofbit; anything that fails
    // without eofbit is treated as a read failure.
    if (file.fail() && !file.eof()) {
        result.failCount++;
    }

    return result;
}
|
||||||
|
|
||||||
|
// Entry point: analyse every file named on the command line in parallel and
// print the combined counters plus derived percentages.
// Returns 1 when no file argument is given, 0 otherwise.
int main(const int argc, char *argv[]) {
    if (argc < 2) {
        std::println("Usage: {} <file1> <file2> ... <fileN>", argv[0]);
        return 1;
    }

    // std::execution::par rather than par_unseq: parseFile performs file I/O
    // and allocates memory, which is not permitted for callables run under an
    // unsequenced (vectorised) policy.
    // The initial value starts with fileCount 0 so that only parsed files are
    // counted (a default AnalysisResult stands for one file).
    const AnalysisResult result = std::transform_reduce(std::execution::par, std::next(argv), argv + argc,
                                                        AnalysisResult{.fileCount = 0},
                                                        std::plus{ },
                                                        parseFile
    );

    // Guard every ratio against a zero denominator.
    double capitalizedPercentage = (result.totalWordCount > 0)
                                       ? static_cast<double>(result.totalCapitalizedCount) / result.totalWordCount * 100.0
                                       : 0;
    double forbiddenPercentage = (result.totalWordCount > 0)
                                     ? static_cast<double>(result.totalForbiddenCount) / result.totalWordCount * 100.0
                                     : 0;
    double wordCountPerSentence = (result.totalSentenceCount > 0)
                                      ? static_cast<double>(result.totalWordCount) / result.totalSentenceCount
                                      : 0;

    std::println("{}\nCapitalized Percentage: {}%\nForbidden Percentage: {}%\nWord Count Per Sentence: {}", std::string(result),
                 capitalizedPercentage, forbiddenPercentage, wordCountPerSentence
    );

    return 0;
}
|
||||||
195
borded_cpp/src/main2.cpp
Normal file
195
borded_cpp/src/main2.cpp
Normal file
@ -0,0 +1,195 @@
|
|||||||
|
#include <string>
|
||||||
|
#include <string_view>
|
||||||
|
#include <fstream>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <numeric>
|
||||||
|
#include <execution>
|
||||||
|
#include <format>
|
||||||
|
#include <codecvt>
|
||||||
|
#include <ranges>
|
||||||
|
|
||||||
|
#if defined(__cpp_lib_print)
#include <print>
#else
// Fallback used when the standard library does not provide <print>:
// minimal print/println built on std::vformat, writing to std::cout.
// NOTE(review): adding declarations to namespace std is formally undefined
// behaviour; kept because the rest of the file calls std::print/std::println.
namespace std {
    // Format `rest` according to `format` and write the text to std::cout.
    template <typename Fmt, typename... Rest>
    inline void print(Fmt format, Rest &&... rest) {
        std::cout << std::vformat(format, std::make_format_args(rest...));
    }

    // Same as print, followed by a newline and a flush (std::endl).
    template <typename Fmt, typename... Rest>
    inline void println(Fmt format, Rest &&... rest) {
        std::cout << std::vformat(format, std::make_format_args(rest...)) << std::endl;
    }
}
#endif
|
||||||
|
|
||||||
|
constexpr std::array<std::wstring_view, 35> BAD_WORDS = {
|
||||||
|
L"recovery",
|
||||||
|
L"techie",
|
||||||
|
L"http",
|
||||||
|
L"https",
|
||||||
|
L"digital",
|
||||||
|
L"hack",
|
||||||
|
L"::",
|
||||||
|
L"//",
|
||||||
|
L"com",
|
||||||
|
L"@",
|
||||||
|
L"crypto",
|
||||||
|
L"bitcoin",
|
||||||
|
L"wallet",
|
||||||
|
L"hacker",
|
||||||
|
L"welcome",
|
||||||
|
L"whatsapp",
|
||||||
|
L"email",
|
||||||
|
L"cryptocurrency",
|
||||||
|
L"stolen",
|
||||||
|
L"freeze",
|
||||||
|
L"quick",
|
||||||
|
L"crucial",
|
||||||
|
L"tracing",
|
||||||
|
L"scammers",
|
||||||
|
L"expers",
|
||||||
|
L"hire",
|
||||||
|
L"century",
|
||||||
|
L"transaction",
|
||||||
|
L"essential",
|
||||||
|
L"managing",
|
||||||
|
L"contact",
|
||||||
|
L"contacting",
|
||||||
|
L"understanding",
|
||||||
|
L"assets",
|
||||||
|
L"funds",
|
||||||
|
};
|
||||||
|
|
||||||
|
constexpr auto SHORTEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::max(),
|
||||||
|
[](std::size_t current, const std::wstring_view &word) {
|
||||||
|
return std::min(current, word.size());
|
||||||
|
}
|
||||||
|
);
|
||||||
|
constexpr auto LONGEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::min(),
|
||||||
|
[](std::size_t current, const std::wstring_view &word) {
|
||||||
|
return std::max(current, word.size());
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
std::size_t totalWordCount = 0;
|
||||||
|
std::size_t totalCapitalizedCount = 0;
|
||||||
|
std::size_t totalSentenceCount = 0;
|
||||||
|
std::size_t totalNumberCount = 0;
|
||||||
|
std::size_t totalForbiddenCount = 0;
|
||||||
|
std::size_t fileCount = 1;
|
||||||
|
|
||||||
|
std::size_t failCount = 0;
|
||||||
|
|
||||||
|
void check_word(std::wstring &word, std::size_t &forbiddenCount) {
|
||||||
|
if (word.size() < SHORTEST_BAD_WORD || word.size() > LONGEST_BAD_WORD) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
std::ranges::transform(word, word.begin(), ::towlower);
|
||||||
|
if (std::ranges::find(BAD_WORDS, word) != BAD_WORDS.end()) {
|
||||||
|
forbiddenCount++;
|
||||||
|
}
|
||||||
|
// if (std::ranges::find_if(BAD_WORDS, [&word](const std::wstring_view &badWord) {
|
||||||
|
// return word.contains(badWord);
|
||||||
|
// }
|
||||||
|
// ) != BAD_WORDS.end()) {
|
||||||
|
// forbiddenCount++;
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scan one UTF-8 encoded text file and add its counts to the global counters
// (totalWordCount, totalCapitalizedCount, totalSentenceCount,
// totalNumberCount, totalForbiddenCount, failCount).
//
// filename: path of the file to read. When the file cannot be opened a
// message is printed and no counter is changed.
// NOTE: the counters are plain, unsynchronised globals, so this function must
// not run concurrently with itself.
void parseFile(const std::string_view &filename) {
    std::wifstream file;

    // Decode the file as UTF-8. std::codecvt_utf8 is deprecated, so the
    // pragmas suppress MSVC warning C4996 for this single statement.
#pragma warning(push)
#pragma warning(suppress : 4996)
    file.imbue(std::locale(std::locale(), new std::codecvt_utf8<wchar_t>));
#pragma warning(pop)

    file.open(std::string(filename));
    if (!file.is_open()) {
        std::println("File doesn't exist: {}", filename);
        return;
    }

    bool inWord = false;   // currently inside a run of non-space characters
    bool isDigit = false;  // a digit was already counted for the current word
    wchar_t c;

    std::wstring word;
    while (file.get(c)) {
        if (c == '.') {
            totalSentenceCount++;
        }

        // Use the wide-character classifiers: the <cctype> functions take an
        // int that must be representable as unsigned char, which a decoded
        // wchar_t in general is not (undefined behaviour otherwise).
        const auto wc = static_cast<wint_t>(c);

        if (::iswspace(wc)) {
            // End of a word: reset the per-word state and test the word.
            inWord = false;
            isDigit = false;

            if (!word.empty()) {
                check_word(word, totalForbiddenCount);
                word.clear();
            }
            continue;
        }

        if (!inWord) {
            // First character of a new word.
            totalWordCount++;
            if (::iswupper(wc)) {
                totalCapitalizedCount++;
            }
        }
        inWord = true;

        // Count a word as a number once, on its first digit.
        if (::iswdigit(wc) && !isDigit) {
            totalNumberCount++;
            isDigit = true;
        }

        word.push_back(c);
    }

    // The last word is not followed by whitespace when the file lacks a
    // trailing newline.
    if (!word.empty()) {
        check_word(word, totalForbiddenCount);
    }

    file.close();

    // Reaching end-of-file sets both failbit and eofbit; anything that fails
    // without eofbit is treated as a read failure.
    if (file.fail() && !file.eof()) {
        failCount++;
    }
}
|
||||||
|
|
||||||
|
// Entry point: analyse every file named on the command line and print the
// global counters plus derived percentages.
// Returns 1 when no file argument is given, 0 otherwise.
int main(const int argc, char *argv[]) {
    if (argc < 2) {
        std::println("Usage: {} <file1> <file2> ... <fileN>", argv[0]);
        return 1;
    }

    // parseFile increments plain (non-atomic) global counters, so running it
    // under a parallel execution policy is a data race. Process the files
    // sequentially; parallelising again requires atomic counters or a
    // per-file result that is reduced afterwards.
    std::for_each(std::next(argv), argv + argc, parseFile);

    // fileCount is not updated by parseFile; report the number of file
    // arguments that were processed instead of its initial value.
    fileCount = static_cast<std::size_t>(argc - 1);

    // Guard every ratio against a zero denominator.
    double capitalizedPercentage = (totalWordCount > 0)
                                       ? static_cast<double>(totalCapitalizedCount) / totalWordCount * 100.0
                                       : 0;
    double forbiddenPercentage = (totalWordCount > 0)
                                     ? static_cast<double>(totalForbiddenCount) / totalWordCount * 100.0
                                     : 0;
    double wordCountPerSentence = (totalSentenceCount > 0)
                                      ? static_cast<double>(totalWordCount) / totalSentenceCount
                                      : 0;

    std::println(
        "Word Count: {}\nCapitalized Count: {}\nSentence Count: {}\nNumber Count: {}\nForbidden Count: {}\nFile Count: {}\nFail Count: {}\nCapitalized Percentage: {}%\nForbidden Percentage: {}%\nWord Count Per Sentence: {}",
        totalWordCount, totalCapitalizedCount, totalSentenceCount, totalNumberCount, totalForbiddenCount, fileCount, failCount,
        capitalizedPercentage, forbiddenPercentage, wordCountPerSentence
    );

    return 0;
}
|
||||||
576
borded_cpp/src/main3.cpp
Normal file
576
borded_cpp/src/main3.cpp
Normal file
@ -0,0 +1,576 @@
|
|||||||
|
#include <string>
|
||||||
|
#include <string_view>
|
||||||
|
#include <fstream>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <iostream>
|
||||||
|
#include <execution>
|
||||||
|
#include <format>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <aio.h>
|
||||||
|
#include <condition_variable>
|
||||||
|
#include <unordered_set>
|
||||||
|
#include <sys/signal.h>
|
||||||
|
|
||||||
|
#if defined(__cpp_lib_print)
#include <print>
#else
// Fallback used when the standard library does not provide <print>:
// minimal print/println built on std::vformat, writing to std::cout.
// NOTE(review): adding declarations to namespace std is formally undefined
// behaviour; kept because the rest of the file calls std::print/std::println.
namespace std {
    // Format `rest` according to `format` and write the text to std::cout.
    template <typename Fmt, typename... Rest>
    inline void print(Fmt format, Rest &&... rest) {
        std::cout << std::vformat(format, std::make_format_args(rest...));
    }

    // Same as print, followed by a newline and a flush (std::endl).
    template <typename Fmt, typename... Rest>
    inline void println(Fmt format, Rest &&... rest) {
        std::cout << std::vformat(format, std::make_format_args(rest...)) << std::endl;
    }
}
#endif
|
||||||
|
|
||||||
|
constexpr std::array<std::string_view, 35> BAD_WORDS = {
|
||||||
|
"recovery",
|
||||||
|
"techie",
|
||||||
|
"http",
|
||||||
|
"https",
|
||||||
|
"digital",
|
||||||
|
"hack",
|
||||||
|
"::",
|
||||||
|
"//",
|
||||||
|
"com",
|
||||||
|
"@",
|
||||||
|
"crypto",
|
||||||
|
"bitcoin",
|
||||||
|
"wallet",
|
||||||
|
"hacker",
|
||||||
|
"welcome",
|
||||||
|
"whatsapp",
|
||||||
|
"email",
|
||||||
|
"cryptocurrency",
|
||||||
|
"stolen",
|
||||||
|
"freeze",
|
||||||
|
"quick",
|
||||||
|
"crucial",
|
||||||
|
"tracing",
|
||||||
|
"scammers",
|
||||||
|
"expers",
|
||||||
|
"hire",
|
||||||
|
"century",
|
||||||
|
"transaction",
|
||||||
|
"essential",
|
||||||
|
"managing",
|
||||||
|
"contact",
|
||||||
|
"contacting",
|
||||||
|
"understanding",
|
||||||
|
"assets",
|
||||||
|
"funds",
|
||||||
|
};
|
||||||
|
const std::unordered_set<std::string_view> BAD_WORDS_SET(BAD_WORDS.begin(), BAD_WORDS.end());
|
||||||
|
// CRC-32 lookup table for the reflected polynomial 0xEDB88320, generated at
// compile time instead of being transcribed by hand. Entry i is the CRC
// remainder of the single byte i; the values are identical to the literal
// table this replaces (e.g. [1] == 0x77073096, [255] == 0x2d02ef8d).
static constexpr std::array<unsigned int, 256> crc_table = [] {
    std::array<unsigned int, 256> table{};
    for (unsigned int index = 0; index < 256; ++index) {
        unsigned int value = index;
        for (int bit = 0; bit < 8; ++bit) {
            value = (value & 1u) ? (0xedb88320u ^ (value >> 1)) : (value >> 1);
        }
        table[index] = value;
    }
    return table;
}();

// CRC-32 of the `size` bytes starting at `str`.
// Initial value and final XOR are both 0xffffffff, so an empty input yields 0.
constexpr uint32_t crc32(char const *str, const size_t size) {
    uint32_t crc = 0xffffffff;
    for (size_t i = 0; i < size; ++i) {
        // Go through unsigned char so a negative char cannot sign-extend
        // before the table index is masked.
        const auto byte = static_cast<unsigned char>(str[i]);
        crc = (crc >> 8) ^ crc_table[(crc ^ byte) & 0xff];
    }
    return crc ^ 0xffffffff;
}

// CRC-32 of the characters of `str` (no terminating NUL is included).
constexpr uint32_t crc32(std::string_view str) {
    return crc32(str.data(), str.size());
}
|
||||||
|
|
||||||
|
constexpr std::array<uint32_t, 35> BAD_WORDS_HASH = {
|
||||||
|
crc32("recovery"),
|
||||||
|
crc32("techie"),
|
||||||
|
crc32("http"),
|
||||||
|
crc32("https"),
|
||||||
|
crc32("digital"),
|
||||||
|
crc32("hack"),
|
||||||
|
crc32("::"),
|
||||||
|
crc32("//"),
|
||||||
|
crc32("com"),
|
||||||
|
crc32("@"),
|
||||||
|
crc32("crypto"),
|
||||||
|
crc32("bitcoin"),
|
||||||
|
crc32("wallet"),
|
||||||
|
crc32("hacker"),
|
||||||
|
crc32("welcome"),
|
||||||
|
crc32("whatsapp"),
|
||||||
|
crc32("email"),
|
||||||
|
crc32("cryptocurrency"),
|
||||||
|
crc32("stolen"),
|
||||||
|
crc32("freeze"),
|
||||||
|
crc32("quick"),
|
||||||
|
crc32("crucial"),
|
||||||
|
crc32("tracing"),
|
||||||
|
crc32("scammers"),
|
||||||
|
crc32("expers"),
|
||||||
|
crc32("hire"),
|
||||||
|
crc32("century"),
|
||||||
|
crc32("transaction"),
|
||||||
|
crc32("essential"),
|
||||||
|
crc32("managing"),
|
||||||
|
crc32("contact"),
|
||||||
|
crc32("contacting"),
|
||||||
|
crc32("understanding"),
|
||||||
|
crc32("assets"),
|
||||||
|
crc32("funds")
|
||||||
|
};
|
||||||
|
const std::unordered_set BAD_WORDS_STR(BAD_WORDS.begin(), BAD_WORDS.end());
|
||||||
|
|
||||||
|
constexpr auto SHORTEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::max(),
|
||||||
|
[](std::size_t current, const std::string_view &word) {
|
||||||
|
return std::min(current, word.size());
|
||||||
|
}
|
||||||
|
);
|
||||||
|
constexpr auto LONGEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::min(),
|
||||||
|
[](std::size_t current, const std::string_view &word) {
|
||||||
|
return std::max(current, word.size());
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
// Running totals over every file, updated by read_str(),
// check_word_simple() and aio_completion_handler().
// NOTE(review): these are plain ints written from the AIO notification
// callbacks; if the callbacks can run concurrently the updates race --
// confirm and consider atomics.
int totalWordCount = 0;
int totalCapitalizedCount = 0;
int totalSentenceCount = 0;
int totalNumberCount = 0;
int totalForbiddenCount = 0;

// Number of completed requests. Starts at 0: the completion handler adds one
// per file, so starting at 1 made the report show one file too many.
int fileCount = 0;

// Number of requests whose aio_error() was non-zero.
int failCount = 0;

// Requests still outstanding; main() sets it to the file count and every
// completion callback decrements it.
int done = 0;
|
||||||
|
|
||||||
|
|
||||||
|
// Per-request context handed to aio_completion_handler() through
// sigev_value.sival_ptr.
struct info {
    // Name given to the request (main() passes the command-line argument).
    std::string_view name;
    // Control block of the request this context belongs to.
    aiocb *cb;
    // Captured when the object is created; only referenced by the
    // commented-out timing code in the completion handler.
    const std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
};
|
||||||
|
|
||||||
|
constexpr void check_word_simple(const char *word, const ssize_t size) {
|
||||||
|
if (size < SHORTEST_BAD_WORD || size > LONGEST_BAD_WORD) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// if (BAD_WORDS_SET.contains(word)) {
|
||||||
|
// totalForbiddenCount++;
|
||||||
|
// }
|
||||||
|
|
||||||
|
const auto hs = crc32(word, size);
|
||||||
|
|
||||||
|
for (int i = 0; i < BAD_WORDS_HASH.size(); ++i) {
|
||||||
|
if (BAD_WORDS_HASH[i] == hs) {
|
||||||
|
totalForbiddenCount++;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void read_str(char *str, ssize_t size) {
|
||||||
|
int mark = -1;
|
||||||
|
|
||||||
|
int fileWords = 0;
|
||||||
|
for (int pos = 0; pos <= size; ++pos) {
|
||||||
|
char *c = str + pos;
|
||||||
|
|
||||||
|
if (*c == '.') {
|
||||||
|
totalSentenceCount++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (*c == ' ' || *c == '\n' || *c == '\r' || *c == '\t') {
|
||||||
|
if (mark != -1) {
|
||||||
|
check_word_simple(str + mark, pos - mark);
|
||||||
|
mark = -1;
|
||||||
|
}
|
||||||
|
} else if (mark == -1) {
|
||||||
|
++fileWords;
|
||||||
|
if (*c >= 'A' && *c <= 'Z') {
|
||||||
|
totalCapitalizedCount++;
|
||||||
|
}
|
||||||
|
|
||||||
|
mark = pos;
|
||||||
|
} else if (*c >= '0' && *c <= '9') {
|
||||||
|
totalNumberCount++;
|
||||||
|
for (; pos <= size; ++pos) {
|
||||||
|
c = str + pos;
|
||||||
|
if (*c == '.') {
|
||||||
|
totalSentenceCount++;
|
||||||
|
}
|
||||||
|
if (*c == ' ' || *c == '\n' || *c == '\r' || *c == '\t') {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mark = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mark != -1) {
|
||||||
|
check_word_simple(str + mark, size - mark);
|
||||||
|
}
|
||||||
|
|
||||||
|
totalWordCount += fileWords;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Completion callback for one read request (installed by main() as the
// SIGEV_THREAD notify function).
//
// `sigval.sival_ptr` is the `info` object that main() allocated with `new`
// for this request. The handler analyses the bytes that were read, or
// records a failure, then releases the `info` and marks the request done.
//
// NOTE(review): the global counters touched here and in read_str() are
// plain ints; if notifications can run concurrently these updates race --
// confirm and consider atomics.
void aio_completion_handler(sigval_t sigval) {
    fileCount++;

    info *data = (info *)sigval.sival_ptr;
    aiocb *req = data->cb;

    /* Did the request complete? */
    const auto error = aio_error(req);
    if (error == 0) {
        /* Request completed successfully, get the return status */
        read_str((char *)req->aio_buf, aio_return(req));
    } else {
        std::println("Error at aio_error ({}): ", error);
        failCount++;
    }

    // Nothing else refers to this context, so free it here; it used to leak.
    delete data;

    // Last step: main() polls `done` to know when every handler has run.
    --done;
}
|
||||||
|
|
||||||
|
// Entry point: reads every file named on the command line with POSIX AIO
// (one request per file, each with its own 5 MiB buffer), waits for all
// completion callbacks, then prints the aggregated statistics.
// Returns 1 on a usage error or when at least one request failed.
int main(const int argc, char *argv[]) {
    if (argc < 2) {
        std::println("Usage: {} <file1> <file2> ... <fileN>", argv[0]);
        return 1;
    }

    const std::size_t file_total = static_cast<std::size_t>(argc - 1);
    constexpr std::size_t buffer_bytes = 5 * 1024 * 1024; // 5mb per file

    done = argc - 1;

    // calloc, not malloc: fields that are not assigned below (for example
    // aio_reqprio) must be zero rather than indeterminate.
    auto aiocb_list = (struct aiocb *)calloc(file_total, sizeof(struct aiocb));
    auto aiocb_list_ptr = (struct aiocb **)malloc(sizeof(struct aiocb *) * file_total);

    for (std::size_t i = 0; i < file_total; i++) {
        // A failed open() leaves -1 here; the request then fails and the
        // completion handler counts it in failCount.
        aiocb_list[i].aio_fildes = open(argv[i + 1], O_RDONLY);
        aiocb_list[i].aio_offset = 0;
        aiocb_list[i].aio_buf = malloc(buffer_bytes);
        aiocb_list[i].aio_nbytes = buffer_bytes;
        // lio_listio() picks the operation from this field; it used to be
        // left unset.
        aiocb_list[i].aio_lio_opcode = LIO_READ;

        aiocb_list[i].aio_sigevent.sigev_notify = SIGEV_THREAD;
        aiocb_list[i].aio_sigevent.sigev_notify_function = aio_completion_handler;
        aiocb_list[i].aio_sigevent.sigev_notify_attributes = nullptr;
        // Released by aio_completion_handler().
        aiocb_list[i].aio_sigevent.sigev_value.sival_ptr = new info{
            argv[i + 1],
            &aiocb_list[i]};

        aiocb_list_ptr[i] = &aiocb_list[i];
    }

    // NOTE(review): the return value is ignored; if a request is never
    // queued its handler never runs and the wait below does not end.
    lio_listio(LIO_WAIT, aiocb_list_ptr, argc - 1, nullptr);

    // The I/O has finished, but the notification callbacks may still be
    // running; wait until each of them has decremented `done`.
    while (done > 0) {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }

    std::println("Done reading files, {} done", done);

    double capitalizedPercentage = (totalWordCount > 0)
        ? static_cast<double>(totalCapitalizedCount) / totalWordCount * 100.0
        : 0;
    double forbiddenPercentage = (totalWordCount > 0)
        ? static_cast<double>(totalForbiddenCount) / totalWordCount * 100.0
        : 0;
    double wordCountPerSentence = (totalSentenceCount > 0)
        ? static_cast<double>(totalWordCount) / totalSentenceCount
        : 0;

    std::println(
        "Word Count: {}\nCapitalized Count: {}\nSentence Count: {}\nNumber Count: {}\nForbidden Count: {}\nFile Count: {}\nFail Count: {}\nCapitalized Percentage: {}%\nForbidden Percentage: {}%\nWord Count Per Sentence: {}",
        totalWordCount, totalCapitalizedCount, totalSentenceCount, totalNumberCount, totalForbiddenCount, fileCount, failCount,
        capitalizedPercentage, forbiddenPercentage, wordCountPerSentence
    );

    for (std::size_t i = 0; i < file_total; i++) {
        close(aiocb_list[i].aio_fildes);
        free((void *)aiocb_list[i].aio_buf);
    }

    free(aiocb_list);
    free(aiocb_list_ptr);

    return failCount > 0 ? 1 : 0;
}
|
||||||
3
jest_rust/.gitignore
vendored
Normal file
3
jest_rust/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
/target
|
||||||
|
/Cargo.lock
|
||||||
|
/test_books
|
||||||
15
jest_rust/Cargo.toml
Normal file
15
jest_rust/Cargo.toml
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
[package]
|
||||||
|
name = "jisspam"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
fxhash = "0.2.1"
|
||||||
|
tokio = { version = "1.44.1", features = ["full"] }
|
||||||
|
|
||||||
|
[profile.release]
|
||||||
|
codegen-units = 1 # less means more compile work but better optimized
|
||||||
|
lto = "fat" # thin has best performance. fat the worst
|
||||||
|
strip = true
|
||||||
|
# opt-level = "z" # slows down
|
||||||
|
panic = "abort"
|
||||||
84
jest_rust/README.md
Normal file
84
jest_rust/README.md
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
for https://retoor.molodetz.nl/retoor/isspam
|
||||||
|
|
||||||
|
extract `../books.tar.gz`
|
||||||
|
|
||||||
|
# local machine benchmarks
|
||||||
|
|
||||||
|
single threaded: `33.63373279571533`
|
||||||
|
|
||||||
|
rayon: `4.294418811798096`
|
||||||
|
|
||||||
|
tokio: `4.717588901519775`
|
||||||
|
|
||||||
|
tokio:
|
||||||
|
|
||||||
|
muncher: `2486ms`
|
||||||
|
|
||||||
|
for_loops: `1227ms`
|
||||||
|
|
||||||
|
for_loops_forbidden_only: `987ms`
|
||||||
|
|
||||||
|
trie creation and stats accumulation take 0ms
|
||||||
|
|
||||||
|
FxHashMap faster than BTreeMap
|
||||||
|
|
||||||
|
## compile options benchmarks
|
||||||
|
`lto`: `thin` vs `fat` doesn't change much
|
||||||
|
|
||||||
|
`codegen-units` 0, 1 doesn't change much
|
||||||
|
|
||||||
|
`opt-level = "z"` slows things down
|
||||||
|
|
||||||
|
# ubuntu terminal running
|
||||||
|
Instructions for running in the Ubuntu terminal at https://snek.molodetz.nl/terminal.html:
|
||||||
|
```
|
||||||
|
mkdir /project
|
||||||
|
cd /project
|
||||||
|
git clone https://retoor.molodetz.nl/retoor/isspam.git
|
||||||
|
apt install valgrind curl
|
||||||
|
export RUSTUP_HOME=/project/.rustup
|
||||||
|
export CARGO_HOME=/project/.cargo
|
||||||
|
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||||
|
. "/project/.cargo/env"
|
||||||
|
cd isspam
|
||||||
|
rustup install nightly
|
||||||
|
rustup default nightly
|
||||||
|
make
|
||||||
|
make benchmark
|
||||||
|
python3 bench.py
|
||||||
|
```
|
||||||
|
|
||||||
|
clone: `git clone https://gitlab.com/jestdotty-group/draft/jisspam.git jest_rust`
|
||||||
|
|
||||||
|
edit the Makefile (`vi Makefile`) and add a `build_jest` target:
|
||||||
|
```
|
||||||
|
build_jest:
|
||||||
|
@echo "compiling jest_rust project"
|
||||||
|
cd jest_rust && cargo build --release && cp target/release/jisspam ..
|
||||||
|
```
|
||||||
|
append `build_jest` to the `all` target:
|
||||||
|
```
|
||||||
|
all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest
|
||||||
|
```
|
||||||
|
|
||||||
|
add to bench: `vi bench.py`
|
||||||
|
```py
|
||||||
|
time_start = time.time()
|
||||||
|
subprocess.check_output('./jisspam books/*.txt', shell=True)
|
||||||
|
print("Time Jest Rust:", time.time() - time_start)
|
||||||
|
```
|
||||||
|
|
||||||
|
run: `python3 bench.py`
|
||||||
|
output looks something like this:
|
||||||
|
```
|
||||||
|
***benchmarking***
|
||||||
|
Time C: 31.315868377685547
|
||||||
|
Time Rust: 41.232205867767334
|
||||||
|
Time CPP: 20.1683189868927
|
||||||
|
Time Borded CPP: 15.468477964401245
|
||||||
|
Time Jest Rust: 54.74523115158081
|
||||||
|
Time Retoor Python: 287.63036131858826
|
||||||
|
***end benchmark***
|
||||||
|
```
|
||||||
|
|
||||||
|
add `/jisspam` to `.gitignore` to not commit the executable accidentally
|
||||||
122
jest_rust/src/main.rs
Normal file
122
jest_rust/src/main.rs
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
mod parser;
|
||||||
|
mod stats;
|
||||||
|
mod trie;
|
||||||
|
|
||||||
|
use stats::Stats;
|
||||||
|
use std::{env, fs, sync::LazyLock};
|
||||||
|
use tokio::sync::mpsc;
|
||||||
|
use trie::Trie;
|
||||||
|
|
||||||
|
static FORBIDDEN_WORDS: LazyLock<Trie> = LazyLock::new(|| {
|
||||||
|
let mut trie = Trie::default();
|
||||||
|
for word in [
|
||||||
|
"recovery",
|
||||||
|
"techie",
|
||||||
|
"http",
|
||||||
|
"https",
|
||||||
|
"digital",
|
||||||
|
"hack",
|
||||||
|
"::",
|
||||||
|
"//",
|
||||||
|
"@",
|
||||||
|
"com",
|
||||||
|
"crypto",
|
||||||
|
"bitcoin",
|
||||||
|
"wallet",
|
||||||
|
"hacker",
|
||||||
|
"welcome",
|
||||||
|
"whatsapp",
|
||||||
|
"email",
|
||||||
|
"cryptocurrency",
|
||||||
|
"stolen",
|
||||||
|
"freeze",
|
||||||
|
"quick",
|
||||||
|
"crucial",
|
||||||
|
"tracing",
|
||||||
|
"scammers",
|
||||||
|
"expers",
|
||||||
|
"hire",
|
||||||
|
"century",
|
||||||
|
"transaction",
|
||||||
|
"essential",
|
||||||
|
"managing",
|
||||||
|
"contact",
|
||||||
|
"contacting",
|
||||||
|
"understanding",
|
||||||
|
"assets",
|
||||||
|
"funds",
|
||||||
|
] {
|
||||||
|
trie.insert(word);
|
||||||
|
}
|
||||||
|
trie
|
||||||
|
});
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
let files = env::args().skip(1);
|
||||||
|
let mut rx = {
|
||||||
|
let (tx, rx) = mpsc::unbounded_channel();
|
||||||
|
for file in files {
|
||||||
|
let tx = tx.clone();
|
||||||
|
tokio::spawn(async move {
|
||||||
|
let mut stats = Stats::default();
|
||||||
|
//reading files in threads doesn't change speed of any sort but oh well
|
||||||
|
if let Ok(text) = fs::read_to_string(&file) {
|
||||||
|
stats.file_count += 1;
|
||||||
|
parser::for_loops::parse(&mut stats, &text);
|
||||||
|
} else {
|
||||||
|
stats.failed_file_count += 1;
|
||||||
|
}
|
||||||
|
let _ = tx.send(stats);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
rx
|
||||||
|
};
|
||||||
|
let mut stats = Stats::default();
|
||||||
|
while let Some(file_stat) = rx.recv().await {
|
||||||
|
stats += file_stat;
|
||||||
|
}
|
||||||
|
println!("{stats}");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// needs ../books.tar.gz to be extracted into ../books
///
/// Builds the release binary, runs it over every file in `../books` and
/// prints its output together with the elapsed time.
/// The test fails when the build or the run cannot be started or exits with
/// a non-zero status; those failures used to be printed and ignored.
#[test]
fn test() {
    use std::{env, fs, process::Command, time::Instant};
    println!("cwd: {}", env::current_dir().unwrap().display());

    //compile
    let build = Command::new("cargo")
        .args(["build", "--release"])
        .output()
        .expect("failed to start cargo");
    assert!(
        build.status.success(),
        "compile failed: {}",
        String::from_utf8_lossy(&build.stderr)
    );
    println!("compiled {}", String::from_utf8_lossy(&build.stdout));

    //get test files
    let files = fs::read_dir("../books")
        .expect("../books is missing; extract ../books.tar.gz first")
        .map(|entry| {
            entry
                .unwrap()
                .path()
                .canonicalize()
                .unwrap()
                .to_str()
                .unwrap()
                .to_string()
        })
        .collect::<Vec<_>>();
    println!("test files found: {}", files.len());

    println!();

    //benchmark run
    let benchmark = Instant::now();
    let run = Command::new("target/release/jisspam")
        .args(&files)
        .output()
        .expect("run failed");
    let elapsed = benchmark.elapsed();
    println!("{}", String::from_utf8_lossy(&run.stdout));
    assert!(
        run.status.success(),
        "run failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );
    println!("benchmark: {}ms", elapsed.as_millis());
}
|
||||||
37
jest_rust/src/parser/for_loops.rs
Normal file
37
jest_rust/src/parser/for_loops.rs
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
use crate::{FORBIDDEN_WORDS, stats::Stats};
|
||||||
|
|
||||||
|
#[allow(dead_code)]
|
||||||
|
/// typically 5000ms
|
||||||
|
/// with trie this is 1600ms
|
||||||
|
pub fn parse(stats: &mut Stats, text: &str) {
|
||||||
|
for sentence in text
|
||||||
|
.split('.')
|
||||||
|
.map(|s| s.trim())
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
{
|
||||||
|
stats.sentence_count += 1;
|
||||||
|
for word in sentence
|
||||||
|
.split_ascii_whitespace()
|
||||||
|
.map(|s| s.trim())
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
{
|
||||||
|
stats.word_count += 1;
|
||||||
|
//get all numbers counted
|
||||||
|
let mut all_capitalized = true;
|
||||||
|
for char in word.chars() {
|
||||||
|
if char.is_numeric() {
|
||||||
|
stats.numeric_count += 1;
|
||||||
|
all_capitalized = false;
|
||||||
|
} else if !char.is_ascii_uppercase() {
|
||||||
|
all_capitalized = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if all_capitalized {
|
||||||
|
stats.capitalized_count += 1;
|
||||||
|
}
|
||||||
|
if FORBIDDEN_WORDS.contains(&word.to_lowercase()) {
|
||||||
|
stats.forbidden_count += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
14
jest_rust/src/parser/for_loops_forbidden_only.rs
Normal file
14
jest_rust/src/parser/for_loops_forbidden_only.rs
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
use crate::{FORBIDDEN_WORDS, stats::Stats};
|
||||||
|
|
||||||
|
#[allow(dead_code)]
|
||||||
|
pub fn parse(stats: &mut Stats, text: &str) {
|
||||||
|
for word in text
|
||||||
|
.split_ascii_whitespace()
|
||||||
|
.map(|s| s.trim())
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
{
|
||||||
|
if FORBIDDEN_WORDS.contains(&word.to_lowercase()) {
|
||||||
|
stats.forbidden_count += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
3
jest_rust/src/parser/mod.rs
Normal file
3
jest_rust/src/parser/mod.rs
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
pub mod for_loops;
|
||||||
|
pub mod for_loops_forbidden_only;
|
||||||
|
pub mod muncher;
|
||||||
66
jest_rust/src/parser/muncher.rs
Normal file
66
jest_rust/src/parser/muncher.rs
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
use crate::{FORBIDDEN_WORDS, stats::Stats};
|
||||||
|
|
||||||
|
#[allow(dead_code)]
|
||||||
|
/// probably buggy. for example, are new lines sentences? what if the text has no last period?
|
||||||
|
/// 500ms is without forbidden words check, but...
|
||||||
|
/// 6000ms if adding forbidden words.. so not faster
|
||||||
|
/// with trie this is 2600ms
|
||||||
|
pub fn parse(stats: &mut Stats, text: &str) {
|
||||||
|
let mut capitalized = true;
|
||||||
|
let mut whitespaced = false;
|
||||||
|
let mut dotted = false;
|
||||||
|
let mut word = String::new();
|
||||||
|
for char in text.chars() {
|
||||||
|
if whitespaced {
|
||||||
|
if !char.is_whitespace() {
|
||||||
|
whitespaced = false; //end whiteness
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
} else if char.is_whitespace() {
|
||||||
|
whitespaced = true;
|
||||||
|
stats.word_count += 1; //end of word
|
||||||
|
if capitalized {
|
||||||
|
stats.capitalized_count += 1;
|
||||||
|
} else {
|
||||||
|
//reset capitalized word
|
||||||
|
capitalized = true;
|
||||||
|
}
|
||||||
|
let lowercase_word = word.to_lowercase();
|
||||||
|
if FORBIDDEN_WORDS.contains(&lowercase_word) {
|
||||||
|
stats.forbidden_count += 1;
|
||||||
|
}
|
||||||
|
word = String::new();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if dotted {
|
||||||
|
if char != '.' {
|
||||||
|
dotted = false; //end sentencing
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
} else if char == '.' {
|
||||||
|
dotted = true;
|
||||||
|
stats.sentence_count += 1;
|
||||||
|
stats.word_count += 1; //end of word
|
||||||
|
if capitalized {
|
||||||
|
stats.capitalized_count += 1;
|
||||||
|
} else {
|
||||||
|
//reset capitalized word
|
||||||
|
capitalized = true;
|
||||||
|
}
|
||||||
|
let lowercase_word = word.to_lowercase();
|
||||||
|
if FORBIDDEN_WORDS.contains(&lowercase_word) {
|
||||||
|
stats.forbidden_count += 1;
|
||||||
|
}
|
||||||
|
word = String::new();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
word += &char.to_string();
|
||||||
|
if char.is_numeric() {
|
||||||
|
stats.numeric_count += 1;
|
||||||
|
capitalized = false;
|
||||||
|
}
|
||||||
|
if !char.is_ascii_uppercase() {
|
||||||
|
capitalized = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
58
jest_rust/src/stats.rs
Normal file
58
jest_rust/src/stats.rs
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
use std::{fmt::Display, ops::AddAssign};
|
||||||
|
|
||||||
|
/// Counters collected while parsing; one value per file, summed with `+=`.
#[derive(Debug, Default)]
pub struct Stats {
    /// Files that were read successfully.
    pub file_count: u32,
    /// Files that could not be read.
    pub failed_file_count: u32,

    pub sentence_count: u32,
    pub word_count: u32,

    pub capitalized_count: u32,
    pub numeric_count: u32,
    pub forbidden_count: u32,
}

impl AddAssign for Stats {
    /// Adds every counter of `rhs` to the matching counter of `self`.
    fn add_assign(&mut self, rhs: Self) {
        self.file_count += rhs.file_count;
        self.failed_file_count += rhs.failed_file_count;

        self.sentence_count += rhs.sentence_count;
        self.word_count += rhs.word_count;

        self.capitalized_count += rhs.capitalized_count;
        self.numeric_count += rhs.numeric_count;
        self.forbidden_count += rhs.forbidden_count;
    }
}

/// `numerator / denominator`, or `0.0` when the denominator is zero.
///
/// Computed in `f64`, which represents every `u32` exactly (an `f32` starts
/// rounding above 2^24).
fn ratio(numerator: u32, denominator: u32) -> f64 {
    if denominator == 0 {
        0.0
    } else {
        f64::from(numerator) / f64::from(denominator)
    }
}

impl Display for Stats {
    /// Writes one `name: value` line per counter followed by the three
    /// derived ratios. A ratio whose denominator is zero is printed as 0
    /// instead of `NaN`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "file count: {}", self.file_count)?;
        writeln!(f, "failed file count: {}", self.failed_file_count)?;

        writeln!(f, "sentence count: {}", self.sentence_count)?;
        writeln!(f, "word count: {}", self.word_count)?;

        writeln!(f, "capitalized count: {}", self.capitalized_count)?;
        writeln!(f, "numeric count: {}", self.numeric_count)?;
        writeln!(f, "forbidden count: {}", self.forbidden_count)?;

        writeln!(
            f,
            "words per sentence average: {:.1}",
            ratio(self.word_count, self.sentence_count)
        )?;
        writeln!(
            f,
            "forbidden word percentage: {:.2}%",
            ratio(self.forbidden_count, self.word_count) * 100.0,
        )?;
        write!(
            f,
            "capitalized word percentage: {:.2}%",
            ratio(self.capitalized_count, self.word_count) * 100.0,
        )
    }
}
|
||||||
33
jest_rust/src/trie.rs
Normal file
33
jest_rust/src/trie.rs
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
use fxhash::FxBuildHasher;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
type FxHashMap<K, V> = HashMap<K, V, FxBuildHasher>; //simpler, slightly faster
|
||||||
|
|
||||||
|
#[derive(Default, Debug, Clone)]
|
||||||
|
struct Node {
|
||||||
|
end: bool,
|
||||||
|
children: FxHashMap<char, Node>,
|
||||||
|
}
|
||||||
|
#[derive(Default, Debug, Clone)]
|
||||||
|
pub struct Trie {
|
||||||
|
root: Node,
|
||||||
|
}
|
||||||
|
impl Trie {
|
||||||
|
pub fn insert(&mut self, word: &str) {
|
||||||
|
let mut node = &mut self.root;
|
||||||
|
for char in word.chars() {
|
||||||
|
node = node.children.entry(char).or_default();
|
||||||
|
}
|
||||||
|
node.end = true;
|
||||||
|
}
|
||||||
|
pub fn contains(&self, word: &str) -> bool {
|
||||||
|
let mut current_node = &self.root;
|
||||||
|
for char in word.chars() {
|
||||||
|
match current_node.children.get(&char) {
|
||||||
|
Some(node) => current_node = node,
|
||||||
|
None => return false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
current_node.end
|
||||||
|
}
|
||||||
|
}
|
||||||
85
retoor_c/isspam.py
Normal file
85
retoor_c/isspam.py
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
|
# NOTE(review): neither constant is used in the visible part of this file,
# and FORBIDDEN_WORDS_COUNT does not match the 35 entries below -- confirm
# before relying on them.
MAX_TEXT_LENGTH = 1024
FORBIDDEN_WORDS_COUNT = 40

# Tokens that count as "forbidden"; matched exactly and case-sensitively by
# is_forbidden(). A set literal avoids building a temporary list first.
forbidden_words = {
    "recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
    "@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency",
    "stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century",
    "transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds",
}
|
||||||
|
|
||||||
|
class AnalysisResult:
    """Per-file counters filled in by analyze_file()."""

    def __init__(self, filename):
        """Remember *filename* and start every counter at zero."""
        # All five counters start at 0; ints are immutable, so sharing the
        # initial value between them is safe.
        self.total_word_count = self.total_capitalized_count = 0
        self.total_sentence_count = self.total_number_count = 0
        self.total_forbidden_count = 0
        self.filename = filename
|
||||||
|
|
||||||
|
def is_forbidden(word):
    """Return True when *word* is, exactly as given, in forbidden_words."""
    found = word in forbidden_words
    return found
|
||||||
|
|
||||||
|
def read_file(filename):
    """Return the text of *filename*, or None when the file does not exist.

    The file is opened directly instead of testing os.path.exists() first,
    so it cannot disappear between the check and the open. It is decoded as
    UTF-8 and undecodable bytes are replaced, so a stray byte in a book no
    longer aborts the whole run. Other I/O errors still propagate.
    """
    try:
        with open(filename, 'r', encoding='utf-8', errors='replace') as file:
            return file.read()
    except FileNotFoundError:
        print(f"File doesn't exist: {filename}")
        return None
|
||||||
|
|
||||||
|
def analyze_file(result):
    """Read result.filename and store its counts on *result*.

    Nothing is changed when the file is missing or empty. Sentences are
    counted as '.' characters; words are whitespace-separated tokens.
    """
    text = read_file(result.filename)
    if not text:
        return

    words = text.split()
    result.total_sentence_count = text.count('.')
    result.total_word_count = len(words)
    # Capitalized: the first character of the token is uppercase.
    result.total_capitalized_count = sum(word[0].isupper() for word in words)
    # Number: the token contains at least one digit.
    result.total_number_count = sum(any(map(str.isdigit, word)) for word in words)
    result.total_forbidden_count = sum(is_forbidden(word) for word in words)
|
||||||
|
|
||||||
|
def main():
    """Analyze every file named on the command line and print the totals.

    Files are analyzed on a thread pool, one AnalysisResult per file. The
    percentages and the words-per-sentence figure are reported as 0 when
    their denominator is 0; the previous `(cond) * (a / b)` form still
    evaluated the division and raised ZeroDivisionError in that case.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <file1> <file2> ... <fileN>")
        return

    results = [AnalysisResult(filename) for filename in sys.argv[1:]]

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(analyze_file, result) for result in results]
        for future in futures:
            # Re-raises any exception that happened in the worker.
            future.result()

    total_word_count = sum(result.total_word_count for result in results)
    total_capitalized_count = sum(result.total_capitalized_count for result in results)
    total_sentence_count = sum(result.total_sentence_count for result in results)
    total_number_count = sum(result.total_number_count for result in results)
    total_forbidden_count = sum(result.total_forbidden_count for result in results)

    if total_word_count > 0:
        capitalized_percentage = total_capitalized_count / total_word_count * 100.0
        forbidden_percentage = total_forbidden_count / total_word_count * 100.0
    else:
        capitalized_percentage = 0.0
        forbidden_percentage = 0.0

    if total_sentence_count > 0:
        word_count_per_sentence = total_word_count / total_sentence_count
    else:
        word_count_per_sentence = 0.0

    print(f"\nTotal Words: {total_word_count}")
    print(f"Total Capitalized words: {total_capitalized_count}")
    print(f"Total Sentences: {total_sentence_count}")
    print(f"Total Numbers: {total_number_count}")
    print(f"Total Forbidden words: {total_forbidden_count}")
    print(f"Capitalized percentage: {capitalized_percentage:.6f}%")
    print(f"Forbidden percentage: {forbidden_percentage:.6f}%")
    print(f"Word count per sentence: {word_count_per_sentence:.6f}")
    print(f"Total files read: {len(sys.argv) - 1}")


if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in New Issue
Block a user