diff --git a/Makefile b/Makefile
index 0577c55..ea28121 100644
--- a/Makefile
+++ b/Makefile
@@ -17,7 +17,7 @@ build_cpp:
 
 build_borded_cpp:
 	@echo "Compiling Borded C++ version of isspam."
-	@g++ -std=c++23 -Ofast borded_cpp/src/main.cpp -o borded_cpp_exec
+	@g++ -std=c++23 -Ofast borded_cpp/src/main3.cpp -o borded_cpp_exec -ltbb
 
 build_risspam:
 	@echo "Compiling 12bitfloat_risspam project."
diff --git a/bench.py b/bench.py
index 64ca6fb..c32749e 100644
--- a/bench.py
+++ b/bench.py
@@ -3,12 +3,12 @@ import time
 
 print("***benchmarking***")
 time_start = time.time()
-subprocess.check_output('./isspam books/*.txt', shell=True)
-print("Time C:",time.time() - time_start)
-time_start = time.time()
 subprocess.check_output('./risspam -p books/*.txt', shell=True)
 print("Time Rust:",time.time() - time_start)
 time_start = time.time()
+subprocess.check_output('./isspam books/*.txt', shell=True)
+print("Time C:",time.time() - time_start)
+time_start = time.time()
 subprocess.check_output('./isspam_cpp books/*.txt', shell=True)
 print("Time CPP:",time.time() - time_start)
 time_start = time.time()
diff --git a/borded_cpp/CMakeLists.txt b/borded_cpp/CMakeLists.txt
index b5c4c72..0e5916a 100644
--- a/borded_cpp/CMakeLists.txt
+++ b/borded_cpp/CMakeLists.txt
@@ -15,7 +15,13 @@ else ()
     add_compile_options(-Wall)
     add_compile_options(-Wextra)
     add_compile_options(-Wpedantic)
-    add_compile_options(-Werror)
+#    add_compile_options(-Werror)
 endif ()
 
-add_executable(${PROJECT_NAME} src/main.cpp)
\ No newline at end of file
+add_executable(${PROJECT_NAME} src/main.cpp)
+add_executable(${PROJECT_NAME}3 src/main3.cpp)
+
+if (LINUX)
+    target_link_libraries(${PROJECT_NAME} tbb)
+    target_link_libraries(${PROJECT_NAME}3 tbb)
+endif ()
\ No newline at end of file
diff --git a/borded_cpp/src/main3.cpp b/borded_cpp/src/main3.cpp
new file mode 100644
index 0000000..c379833
--- /dev/null
+++ b/borded_cpp/src/main3.cpp
@@ -0,0 +1,355 @@
+#include <string>
+#include <string_view>
+#include <fstream>
+#include <algorithm>
+#include <iostream>
+#include <execution>
+#include <format>
+#include <cstdio>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <aio.h>
+#include <condition_variable>
+#include <unordered_set>
+#include <sys/signal.h>
+
+#include <array>
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <mutex>
+#include <vector>
+#include <version>
+#include <unistd.h>
+
+// <version> (included above) defines __cpp_lib_print when <print> exists, so
+// this check does not depend on which other headers expose the macro.
+#ifdef __cpp_lib_print
+#include <print>
+namespace compat {
+using std::print;
+using std::println;
+}
+#else
+// Fallback for standard libraries without <print>. It lives in its own
+// namespace because adding declarations to namespace std is undefined behaviour.
+namespace compat {
+template <typename T, typename... Args>
+inline void print(T format, Args &&... args) {
+    std::cout << std::vformat(format, std::make_format_args(args...));
+}
+
+template <typename T, typename... Args>
+inline void println(T format, Args &&... args) {
+    std::cout << std::vformat(format, std::make_format_args(args...)) << '\n';
+}
+}
+#endif
+
+// Tokens counted as forbidden when a whitespace-delimited word equals one of them.
+constexpr std::array<std::string_view, 35> BAD_WORDS = {
+    "recovery", "techie", "http", "https", "digital",
+    "hack", "::", "//", "com", "@",
+    "crypto", "bitcoin", "wallet", "hacker", "welcome",
+    "whatsapp", "email", "cryptocurrency", "stolen", "freeze",
+    "quick", "crucial", "tracing", "scammers", "expers",
+    "hire", "century", "transaction", "essential", "managing",
+    "contact", "contacting", "understanding", "assets", "funds",
+};
+
+// Builds the reflected CRC-32 lookup table (polynomial 0xEDB88320) at compile
+// time instead of spelling out all 256 entries by hand.
+constexpr std::array<std::uint32_t, 256> make_crc_table() {
+    std::array<std::uint32_t, 256> table{};
+    for (std::uint32_t n = 0; n < table.size(); ++n) {
+        std::uint32_t c = n;
+        for (int bit = 0; bit < 8; ++bit) {
+            c = (c & 1u) != 0 ? (0xedb88320u ^ (c >> 1)) : (c >> 1);
+        }
+        table[n] = c;
+    }
+    return table;
+}
+
+static constexpr auto crc_table = make_crc_table();
+
+// CRC-32 of str; the initial value and the final XOR are both 0xffffffff.
+constexpr std::uint32_t crc32(std::string_view str) {
+    std::uint32_t crc = 0xffffffffu;
+    for (const char c : str) {
+        crc = (crc >> 8) ^ crc_table[(crc ^ static_cast<unsigned char>(c)) & 0xffu];
+    }
+    return crc ^ 0xffffffffu;
+}
+
+// Same hash for a (pointer, length) pair.
+constexpr std::uint32_t crc32(char const *str, const std::size_t size) {
+    return crc32(std::string_view(str, size));
+}
+
+// Spot checks against entries of the previously hard-coded table and the
+// standard CRC-32 check value.
+static_assert(crc_table[1] == 0x77073096u && crc_table[255] == 0x2d02ef8du);
+static_assert(crc32("123456789") == 0xcbf43926u);
+
+// Hashes of BAD_WORDS, derived from the list so the two cannot drift apart.
+constexpr auto BAD_WORDS_HASH = [] {
+    std::array<std::uint32_t, BAD_WORDS.size()> hashes{};
+    for (std::size_t i = 0; i < BAD_WORDS.size(); ++i) {
+        hashes[i] = crc32(BAD_WORDS[i]);
+    }
+    return hashes;
+}();
+
+constexpr auto SHORTEST_BAD_WORD = std::ranges::fold_left(
+    BAD_WORDS, std::numeric_limits<std::size_t>::max(),
+    [](std::size_t current, const std::string_view &word) { return std::min(current, word.size()); });
+constexpr auto LONGEST_BAD_WORD = std::ranges::fold_left(
+    BAD_WORDS, std::numeric_limits<std::size_t>::min(),
+    [](std::size_t current, const std::string_view &word) { return std::max(current, word.size()); });
+
+// Completion handlers run on their own threads, so once requests have been
+// submitted the counters below are only touched while holding state_mutex.
+std::mutex state_mutex;
+std::condition_variable done_cv;
+
+int totalWordCount = 0;
+int totalCapitalizedCount = 0;
+int totalSentenceCount = 0;
+int totalNumberCount = 0;
+int totalForbiddenCount = 0;
+int fileCount = 0;
+int failCount = 0;
+
+// Number of submitted reads whose completion handler has not finished yet.
+int done = 0;
+
+// Buffer size used when the size of an input cannot be determined up front.
+constexpr std::size_t FALLBACK_BUFFER_SIZE = 5 * 1024 * 1024;
+
+// Per-request context handed to the completion handler.
+struct info {
+    std::string_view name;
+    aiocb *cb = nullptr;
+};
+
+// Counters gathered from one buffer before they are merged into the totals.
+struct file_stats {
+    int words = 0;
+    int capitalized = 0;
+    int sentences = 0;
+    int numbers = 0;
+    int forbidden = 0;
+};
+
+constexpr bool is_separator(const char c) {
+    return c == ' ' || c == '\n' || c == '\r' || c == '\t';
+}
+
+// Returns true when the `size` bytes at `word` are exactly one of BAD_WORDS.
+// The CRC is only a cheap pre-filter; the text is compared as well so that a
+// hash collision cannot be counted as a forbidden word.
+bool check_word_simple(const char *word, const ssize_t size) {
+    if (size < 0) {
+        return false;
+    }
+    const std::string_view candidate(word, static_cast<std::size_t>(size));
+    if (candidate.size() < SHORTEST_BAD_WORD || candidate.size() > LONGEST_BAD_WORD) {
+        return false;
+    }
+
+    const auto hs = crc32(candidate);
+    for (std::size_t i = 0; i < BAD_WORDS_HASH.size(); ++i) {
+        if (BAD_WORDS_HASH[i] == hs && BAD_WORDS[i] == candidate) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// Scans `size` bytes of text and returns the word, capitalised-word, sentence,
+// number and forbidden-word counts. Only bytes in [0, size) are read.
+file_stats scan_text(const char *str, const ssize_t size) {
+    file_stats stats;
+    ssize_t mark = -1; // start of the word being scanned, -1 between words
+
+    for (ssize_t pos = 0; pos < size; ++pos) {
+        const char c = str[pos];
+
+        if (c == '.') {
+            ++stats.sentences;
+        }
+
+        if (is_separator(c)) {
+            if (mark != -1) {
+                stats.forbidden += check_word_simple(str + mark, pos - mark) ? 1 : 0;
+                mark = -1;
+            }
+        } else if (mark == -1) {
+            ++stats.words;
+            if (c >= 'A' && c <= 'Z') {
+                ++stats.capitalized;
+            }
+            mark = pos;
+        } else if (c >= '0' && c <= '9') {
+            // A digit after the first character marks the token as a number:
+            // skip the rest of it (still counting '.') without a word check.
+            ++stats.numbers;
+            for (; pos < size && !is_separator(str[pos]); ++pos) {
+                if (str[pos] == '.') {
+                    ++stats.sentences;
+                }
+            }
+            mark = -1;
+        }
+    }
+
+    if (mark != -1) {
+        stats.forbidden += check_word_simple(str + mark, size - mark) ? 1 : 0;
+    }
+    return stats;
+}
+
+// Analyses one buffer and merges its counters into the global totals.
+void read_str(char *str, ssize_t size) {
+    const file_stats stats = scan_text(str, size);
+
+    const std::lock_guard lock(state_mutex);
+    totalWordCount += stats.words;
+    totalCapitalizedCount += stats.capitalized;
+    totalSentenceCount += stats.sentences;
+    totalNumberCount += stats.numbers;
+    totalForbiddenCount += stats.forbidden;
+}
+
+// SIGEV_THREAD callback for one finished read: analyses the buffer on success,
+// records a failure otherwise, then reports completion to main().
+void aio_completion_handler(sigval_t sigval) {
+    const auto *data = static_cast<const info *>(sigval.sival_ptr);
+    aiocb *req = data->cb;
+
+    const int error = aio_error(req);
+    if (error == 0) {
+        read_str(static_cast<char *>(const_cast<void *>(req->aio_buf)), aio_return(req));
+    } else {
+        compat::println("Error at aio_error ({}): ", error);
+        const std::lock_guard lock(state_mutex);
+        ++failCount;
+    }
+
+    // Notify while holding the lock so main() cannot wake up, return and tear
+    // down the condition variable while this thread is still using it.
+    const std::lock_guard lock(state_mutex);
+    --done;
+    done_cv.notify_one();
+}
+
+int main(const int argc, char *argv[]) {
+    if (argc < 2) {
+        compat::println("Usage: {} <file1> <file2> ... <fileN>", argv[0]);
+        return 1;
+    }
+
+    const auto inputCount = static_cast<std::size_t>(argc - 1);
+
+    // Value-initialised (zeroed) so fields that are not set below, such as
+    // aio_reqprio, never contain heap garbage.
+    std::vector<aiocb> aiocb_list(inputCount);
+    std::vector<aiocb *> aiocb_list_ptr(inputCount, nullptr);
+    std::vector<info> infos(inputCount);
+    std::vector<std::unique_ptr<char[]>> buffers(inputCount);
+
+    const auto close_files = [&aiocb_list] {
+        for (const aiocb &cb : aiocb_list) {
+            if (cb.aio_fildes != -1) {
+                close(cb.aio_fildes);
+            }
+        }
+    };
+
+    // Every input counts as a file, including the ones that fail to open.
+    fileCount = argc - 1;
+    int queued = 0;
+
+    for (std::size_t i = 0; i < inputCount; ++i) {
+        aiocb &cb = aiocb_list[i];
+        cb.aio_fildes = open(argv[i + 1], O_RDONLY);
+        if (cb.aio_fildes == -1) {
+            compat::println("Error opening {}: {}", argv[i + 1], std::strerror(errno));
+            ++failCount;
+            continue; // the list entry stays null, which lio_listio ignores
+        }
+
+        // Size the buffer from the file so large inputs are not cut off at a
+        // fixed limit; fall back to 5 MiB when the size is unknown.
+        std::size_t capacity = FALLBACK_BUFFER_SIZE;
+        struct stat st {};
+        if (fstat(cb.aio_fildes, &st) == 0 && S_ISREG(st.st_mode) && st.st_size > 0) {
+            capacity = static_cast<std::size_t>(st.st_size);
+        }
+        buffers[i] = std::make_unique_for_overwrite<char[]>(capacity);
+
+        cb.aio_offset = 0;
+        cb.aio_buf = buffers[i].get();
+        cb.aio_nbytes = capacity;
+        cb.aio_lio_opcode = LIO_READ;
+        cb.aio_sigevent.sigev_notify = SIGEV_THREAD;
+        cb.aio_sigevent.sigev_notify_function = aio_completion_handler;
+        cb.aio_sigevent.sigev_notify_attributes = nullptr;
+
+        infos[i] = info{argv[i + 1], &cb};
+        cb.aio_sigevent.sigev_value.sival_ptr = &infos[i];
+
+        aiocb_list_ptr[i] = &cb;
+        ++queued;
+    }
+
+    done = queued;
+
+    if (queued > 0 && lio_listio(LIO_WAIT, aiocb_list_ptr.data(), argc - 1, nullptr) == -1) {
+        const int submitError = errno;
+        // Per POSIX, with any error other than EAGAIN, EINTR or EIO nothing was
+        // started, so no completion handler would ever run: bail out instead
+        // of waiting forever.
+        if (submitError != EAGAIN && submitError != EINTR && submitError != EIO) {
+            compat::println("Error at lio_listio ({}): ", submitError);
+            close_files();
+            return 1;
+        }
+    }
+
+    // The handlers may still be running after lio_listio returns; wait until
+    // every submitted request has reported in.
+    int remaining = 0;
+    {
+        std::unique_lock lock(state_mutex);
+        done_cv.wait(lock, [] { return done <= 0; });
+        remaining = done;
+    }
+
+    compat::println("Done reading files, {} done", remaining);
+
+    double capitalizedPercentage = (totalWordCount > 0)
+                                       ? static_cast<double>(totalCapitalizedCount) / totalWordCount * 100.0
+                                       : 0;
+    double forbiddenPercentage = (totalWordCount > 0)
+                                     ? static_cast<double>(totalForbiddenCount) / totalWordCount * 100.0
+                                     : 0;
+    double wordCountPerSentence = (totalSentenceCount > 0)
+                                      ? static_cast<double>(totalWordCount) / totalSentenceCount
+                                      : 0;
+
+    compat::println(
+        "Word Count: {}\nCapitalized Count: {}\nSentence Count: {}\nNumber Count: {}\nForbidden Count: {}\nFile Count: {}\nFail Count: {}\nCapitalized Percentage: {}%\nForbidden Percentage: {}%\nWord Count Per Sentence: {}",
+        totalWordCount, totalCapitalizedCount, totalSentenceCount, totalNumberCount, totalForbiddenCount, fileCount, failCount,
+        capitalizedPercentage, forbiddenPercentage, wordCountPerSentence
+    );
+
+    close_files();
+
+    return failCount > 0 ? 1 : 0;
+}