diff --git a/.gitignore b/.gitignore
index c59232f..d853c60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ __pycache__
 target
 isspam
 risspam
+isspam_cpp
 .build-trigger-2014-12-02 15:26
diff --git a/Makefile b/Makefile
index 19836fa..3a111c7 100644
--- a/Makefile
+++ b/Makefile
@@ -1,20 +1,20 @@
 CC = gcc
 CFLAGS = -Ofast
 
-all: build run valgrind build_risspam run_risspam
+all: build run valgrind build_risspam run_risspam build_cpp
 
 build:
 	@echo "Compiling retoor_c project.".
-	@# removed -pedantic flag because it doesn't accept ' for formatting numbers
-	@# using printf
 	@$(CC) $(CFLAGS) retoor_c/isspam.c -o isspam
 
+build_cpp:
+	@echo "Compiling C++ version of isspam."
+	@g++ -Ofast -pthread retoor_c/isspam.cpp -o isspam_cpp
 
 build_risspam:
 	@echo "Compiling 12bitfloat_risspam project."
 	cd 12bitfloat_rust/risspam && cargo run --release && cp target/release/risspam ../../
 
-
 run: run_spam wl run_not_spam
 
 run_risspam: run_spam_risspam run_not_spam_risspam
@@ -36,7 +36,6 @@ run_spam_risspam:
 run_not_spam_risspam:
 	@./risspam ./not_spam/*.txt
 
-
 valgrind: build
 	valgrind ./isspam ./spam/*.txt
 
diff --git a/bench.py b/bench.py
index 9aa7eb2..0385e0c 100644
--- a/bench.py
+++ b/bench.py
@@ -8,4 +8,7 @@ print("Time C:",time.time() - time_start)
 time_start = time.time()
 subprocess.check_output('./risspam -p books/*.txt', shell=True)
 print("Time Rust:",time.time() - time_start)
+time_start = time.time()
+subprocess.check_output('./isspam_cpp books/*.txt', shell=True)
+print("Time CPP:",time.time() - time_start)
 print("***end benchmark***")
diff --git a/retoor_c/isspam.c b/retoor_c/isspam.c
index 62c781d..064ac14 100644
--- a/retoor_c/isspam.c
+++ b/retoor_c/isspam.c
@@ -98,7 +98,7 @@ void* analyze_file(void* arg) {
     }
 
     char *saveptr;
-    char* token = strtok_r(text, " .?!;:\n", &saveptr);
+    char* token = strtok_r(text, " \f\v\r\n\t", &saveptr);
 
     while (token != NULL) {
         word_count++;
@@ -117,7 +117,7 @@ void* analyze_file(void* arg) {
             forbidden_count++;
         }
 
-        token = strtok_r(NULL, " .?!;:\n", &saveptr);
+        token = strtok_r(NULL, " \f\v\r\n\t", &saveptr);
     }
 
     result->total_word_count = word_count;
@@ -180,4 +180,4 @@ int main(int argc, char *argv[]) {
     printf("Word count per sentence: %.6f\n", word_count_per_sentence);
     printf("Total files read: %d\n", (int)(argc - 1));
     return 0;
-}
\ No newline at end of file
+}
diff --git a/retoor_c/isspam.cpp b/retoor_c/isspam.cpp
new file mode 100644
index 0000000..ebe0370
--- /dev/null
+++ b/retoor_c/isspam.cpp
@@ -0,0 +1,161 @@
+// Author: retoor@molodetz.nl
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <thread>
+#include <unordered_set>
+#include <algorithm>
+#include <sstream>
+#include <cctype>
+#include <cstddef>
+
+// Tokens counted as "forbidden". Lookup is exact and case-sensitive, so a
+// token only matches when it equals one of these strings byte for byte.
+const std::unordered_set<std::string> forbidden_words = {
+    "recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
+    "@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency",
+    "stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century",
+    "transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds",
+};
+
+// Counters gathered for one input file; all stay 0 if the file is unreadable or empty.
+struct AnalysisResult {
+    std::string filename;
+    long long total_word_count = 0;
+    long long total_capitalized_count = 0;
+    long long total_sentence_count = 0;
+    long long total_number_count = 0;
+    long long total_forbidden_count = 0;
+};
+
+// Returns the full content of `filename`. When the file cannot be opened an
+// error is written to stderr and an empty string is returned.
+std::string read_file(const std::string& filename) {
+    std::ifstream file(filename);
+    if (!file) {
+        std::cerr << "File doesn't exist: " << filename << std::endl;
+        return "";
+    }
+
+    std::ostringstream content;
+    content << file.rdbuf(); // Read the entire file into a string
+    return content.str();
+}
+
+// Analyzes result.filename and stores the counters in `result`:
+//   words       - whitespace-separated tokens
+//   capitalized - tokens whose first character is an upper-case letter
+//   sentences   - number of '.' characters in the text
+//   numbers     - tokens containing at least one digit
+//   forbidden   - tokens that are in forbidden_words
+// `result` is left untouched when the file is unreadable or empty.
+void analyze_file(AnalysisResult& result) {
+    const std::string text = read_file(result.filename);
+    if (text.empty()) {
+        return;
+    }
+
+    long long word_count = 0;
+    long long capitalized_count = 0;
+    long long number_count = 0;
+    long long forbidden_count = 0;
+
+    // Every '.' counts as one sentence end.
+    const long long sentence_count = std::count(text.begin(), text.end(), '.');
+
+    // The <cctype> classifiers are undefined for negative values, so bytes
+    // >= 0x80 (e.g. UTF-8) must be converted to unsigned char first.
+    const auto is_digit = [](unsigned char c) { return std::isdigit(c) != 0; };
+
+    std::istringstream stream(text);
+    std::string token;
+    while (stream >> token) {
+        word_count++;
+
+        if (std::isupper(static_cast<unsigned char>(token[0]))) {
+            capitalized_count++;
+        }
+
+        if (std::any_of(token.begin(), token.end(), is_digit)) {
+            number_count++;
+        }
+
+        if (forbidden_words.find(token) != forbidden_words.end()) {
+            forbidden_count++;
+        }
+    }
+
+    result.total_word_count = word_count;
+    result.total_capitalized_count = capitalized_count;
+    result.total_sentence_count = sentence_count;
+    result.total_number_count = number_count;
+    result.total_forbidden_count = forbidden_count;
+}
+
+// Analyzes every file named on the command line on a bounded set of worker
+// threads and prints the aggregated statistics. Returns 1 when no file is given.
+int main(int argc, char *argv[]) {
+    if (argc < 2) {
+        std::cerr << "Usage: " << argv[0] << " <file1> <file2> ... <fileN>" << std::endl;
+        return 1;
+    }
+
+    std::vector<AnalysisResult> results(argc - 1);
+    for (int i = 1; i < argc; i++) {
+        results[i - 1].filename = argv[i];
+    }
+
+    // One thread per file can exhaust the per-process thread limit on large
+    // inputs (std::thread then throws), so use at most one worker per core.
+    std::size_t worker_count = std::thread::hardware_concurrency();
+    if (worker_count == 0) {
+        worker_count = 1; // hardware_concurrency() may return 0 when unknown
+    }
+    worker_count = std::min(worker_count, results.size());
+
+    std::vector<std::thread> threads;
+    threads.reserve(worker_count);
+    for (std::size_t worker = 0; worker < worker_count; worker++) {
+        // Each worker handles files worker, worker + worker_count, ...; no element is shared.
+        threads.emplace_back([&results, worker, worker_count] {
+            for (std::size_t i = worker; i < results.size(); i += worker_count) {
+                analyze_file(results[i]);
+            }
+        });
+    }
+
+    for (auto& thread : threads) {
+        thread.join();
+    }
+
+    long long total_word_count = 0;
+    long long total_capitalized_count = 0;
+    long long total_sentence_count = 0;
+    long long total_number_count = 0;
+    long long total_forbidden_count = 0;
+
+    for (const auto& result : results) {
+        total_word_count += result.total_word_count;
+        total_capitalized_count += result.total_capitalized_count;
+        total_sentence_count += result.total_sentence_count;
+        total_number_count += result.total_number_count;
+        total_forbidden_count += result.total_forbidden_count;
+    }
+
+    double capitalized_percentage = (total_word_count > 0) ? (static_cast<double>(total_capitalized_count) / total_word_count * 100.0) : 0;
+    double forbidden_percentage = (total_word_count > 0) ? (static_cast<double>(total_forbidden_count) / total_word_count * 100.0) : 0;
+    double word_count_per_sentence = (total_sentence_count > 0) ? (static_cast<double>(total_word_count) / total_sentence_count) : 0;
+
+    std::cout << "\nTotal Words: " << total_word_count << '\n';
+    std::cout << "Total Capitalized words: " << total_capitalized_count << '\n';
+    std::cout << "Total Sentences: " << total_sentence_count << '\n';
+    std::cout << "Total Numbers: " << total_number_count << '\n';
+    std::cout << "Total Forbidden words: " << total_forbidden_count << '\n';
+    std::cout << "Capitalized percentage: " << capitalized_percentage << "%" << '\n';
+    std::cout << "Forbidden percentage: " << forbidden_percentage << "%" << '\n';
+    std::cout << "Word count per sentence: " << word_count_per_sentence << '\n';
+    std::cout << "Total files read: " << (argc - 1) << '\n';
+    return 0;
+}