parent
a08c181872
commit
75092cb738
2
Makefile
2
Makefile
@ -17,7 +17,7 @@ build_cpp:
|
||||
|
||||
build_borded_cpp:
|
||||
@echo "Compiling Borded C++ version of isspam."
|
||||
@g++ -std=c++23 -Ofast borded_cpp/src/main.cpp -o borded_cpp_exec
|
||||
@g++ -std=c++23 -Ofast borded_cpp/src/main3.cpp -o borded_cpp_exec -ltbb
|
||||
|
||||
build_risspam:
|
||||
@echo "Compiling 12bitfloat_risspam project."
|
||||
|
6
bench.py
6
bench.py
@ -3,12 +3,12 @@ import time
|
||||
|
||||
print("***benchmarking***")
|
||||
time_start = time.time()
|
||||
subprocess.check_output('./isspam books/*.txt', shell=True)
|
||||
print("Time C:",time.time() - time_start)
|
||||
time_start = time.time()
|
||||
subprocess.check_output('./risspam -p books/*.txt', shell=True)
|
||||
print("Time Rust:",time.time() - time_start)
|
||||
time_start = time.time()
|
||||
subprocess.check_output('./isspam books/*.txt', shell=True)
|
||||
print("Time C:",time.time() - time_start)
|
||||
time_start = time.time()
|
||||
subprocess.check_output('./isspam_cpp books/*.txt', shell=True)
|
||||
print("Time CPP:",time.time() - time_start)
|
||||
time_start = time.time()
|
||||
|
@ -15,7 +15,13 @@ else ()
|
||||
add_compile_options(-Wall)
|
||||
add_compile_options(-Wextra)
|
||||
add_compile_options(-Wpedantic)
|
||||
add_compile_options(-Werror)
|
||||
# add_compile_options(-Werror)
|
||||
endif ()
|
||||
|
||||
add_executable(${PROJECT_NAME} src/main.cpp)
|
||||
add_executable(${PROJECT_NAME} src/main.cpp)
|
||||
add_executable(${PROJECT_NAME}3 src/main3.cpp)
|
||||
|
||||
if (LINUX)
|
||||
target_link_libraries(${PROJECT_NAME} tbb)
|
||||
target_link_libraries(${PROJECT_NAME}3 tbb)
|
||||
endif ()
|
576
borded_cpp/src/main3.cpp
Normal file
576
borded_cpp/src/main3.cpp
Normal file
@ -0,0 +1,576 @@
|
||||
#include <string>
#include <string_view>
#include <fstream>
#include <algorithm>
#include <array>
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <execution>
#include <format>
#include <iostream>
#include <limits>
#include <memory>
#include <thread>
#include <unordered_set>
#include <vector>

#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <aio.h>
#include <sys/signal.h>
#include <unistd.h>
|
||||
|
||||
// <version> defines every library feature-test macro, so the check below does
// not depend on which other headers happen to define __cpp_lib_print.
#include <version>

#ifdef __cpp_lib_print
#include <print>
#else
// Fallback for standard libraries that do not ship <print>: minimal
// std::print / std::println look-alikes built on std::vformat and std::cout.
// NOTE(review): adding declarations to namespace std is formally undefined
// behaviour; it is kept so the std::println call sites keep working unchanged.
namespace std {

/// Formats `args` according to `format` and writes the result to std::cout.
template <typename T, typename... Args>
inline void print(T format, Args &&... args) {
    // `args` are named parameters (lvalues), which is what make_format_args expects.
    std::cout << std::vformat(format, std::make_format_args(args...));
}

/// Same as print(), followed by a newline and a flush of std::cout.
template <typename T, typename... Args>
inline void println(T format, Args &&... args) {
    std::cout << std::vformat(format, std::make_format_args(args...)) << std::endl;
}

} // namespace std
#endif
|
||||
|
||||
// Words and tokens that count as "forbidden" when they appear as a whole
// whitespace-delimited word. BAD_WORDS_HASH lists the CRC-32 of the same
// entries in the same order.
constexpr std::array<std::string_view, 35> BAD_WORDS = {
    "recovery",    "techie",         "http",          "https",     "digital",
    "hack",        "::",             "//",            "com",       "@",
    "crypto",      "bitcoin",        "wallet",        "hacker",    "welcome",
    "whatsapp",    "email",          "cryptocurrency", "stolen",   "freeze",
    "quick",       "crucial",        "tracing",       "scammers",  "expers",
    "hire",        "century",        "transaction",   "essential", "managing",
    "contact",     "contacting",     "understanding", "assets",    "funds"
};
|
||||
// Hash-set view of BAD_WORDS; its only visible reference in this file is the
// commented-out lookup inside check_word_simple.
const std::unordered_set<std::string_view> BAD_WORDS_SET(
    std::cbegin(BAD_WORDS),
    std::cend(BAD_WORDS));
|
||||
// Byte-indexed lookup table used by the crc32() overloads below:
// entry i is XORed into the running value after it is shifted right by 8.
static constexpr unsigned int crc_table[256] = {
    0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
    0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
    0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
    0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
    0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
    0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
    0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
    0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
    0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
    0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
    0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
    0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
    0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
    0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
    0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
    0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
    0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
    0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
    0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
    0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
    0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
    0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
    0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
    0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
    0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
    0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
    0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
    0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
    0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
    0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
    0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
    0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
    0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
    0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
    0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
    0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
    0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
    0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
    0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
    0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
    0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
    0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
    0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
    0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
    0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
    0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
    0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
    0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
    0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
    0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
    0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
    0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
    0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
    0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
    0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
    0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
    0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
    0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
    0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
    0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
    0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
    0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
    0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
    0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
};
|
||||
|
||||
constexpr uint32_t crc32(std::string_view str) {
|
||||
uint32_t crc = 0xffffffff;
|
||||
for (auto c : str)
|
||||
crc = (crc >> 8) ^ crc_table[(crc ^ c) & 0xff];
|
||||
return crc ^ 0xffffffff;
|
||||
}
|
||||
|
||||
constexpr uint32_t crc32(char const *str, const size_t size) {
|
||||
uint32_t crc = 0xffffffff;
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
crc = (crc >> 8) ^ crc_table[(crc ^ str[i]) & 0xff];
|
||||
return crc ^ 0xffffffff;
|
||||
}
|
||||
|
||||
constexpr std::array<uint32_t, 35> BAD_WORDS_HASH = {
|
||||
crc32("recovery"),
|
||||
crc32("techie"),
|
||||
crc32("http"),
|
||||
crc32("https"),
|
||||
crc32("digital"),
|
||||
crc32("hack"),
|
||||
crc32("::"),
|
||||
crc32("//"),
|
||||
crc32("com"),
|
||||
crc32("@"),
|
||||
crc32("crypto"),
|
||||
crc32("bitcoin"),
|
||||
crc32("wallet"),
|
||||
crc32("hacker"),
|
||||
crc32("welcome"),
|
||||
crc32("whatsapp"),
|
||||
crc32("email"),
|
||||
crc32("cryptocurrency"),
|
||||
crc32("stolen"),
|
||||
crc32("freeze"),
|
||||
crc32("quick"),
|
||||
crc32("crucial"),
|
||||
crc32("tracing"),
|
||||
crc32("scammers"),
|
||||
crc32("expers"),
|
||||
crc32("hire"),
|
||||
crc32("century"),
|
||||
crc32("transaction"),
|
||||
crc32("essential"),
|
||||
crc32("managing"),
|
||||
crc32("contact"),
|
||||
crc32("contacting"),
|
||||
crc32("understanding"),
|
||||
crc32("assets"),
|
||||
crc32("funds")
|
||||
};
|
||||
// Second hash-set copy of BAD_WORDS, with the element type spelled out
// (the iterator pair deduces std::string_view).
const std::unordered_set<std::string_view> BAD_WORDS_STR(
    std::cbegin(BAD_WORDS),
    std::cend(BAD_WORDS));
|
||||
|
||||
constexpr auto SHORTEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::max(),
|
||||
[](std::size_t current, const std::string_view &word) {
|
||||
return std::min(current, word.size());
|
||||
}
|
||||
);
|
||||
constexpr auto LONGEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::min(),
|
||||
[](std::size_t current, const std::string_view &word) {
|
||||
return std::max(current, word.size());
|
||||
}
|
||||
);
|
||||
|
||||
// Process-wide statistics, accumulated while the input files are scanned and
// printed by main().
int totalWordCount{0};        // whitespace-delimited words (read_str)
int totalCapitalizedCount{0}; // words whose first character is 'A'..'Z'
int totalSentenceCount{0};    // '.' characters seen
int totalNumberCount{0};      // tokens with a digit after their first character
int totalForbiddenCount{0};   // words matched by check_word_simple

// Incremented once per completion callback.
// NOTE(review): starts at 1, so the printed "File Count" is one more than the
// number of completed requests; confirm this is intended.
int fileCount{1};

// Completion callbacks for which aio_error() reported a failure.
int failCount{0};

// Completions still outstanding: main() sets it to the number of files and
// every completion callback decrements it.
int done{0};
|
||||
|
||||
|
||||
// Per-request context passed to aio_completion_handler through
// sigev_value.sival_ptr.
struct info {
    // File name as given on the command line (a view, not a copy).
    std::string_view name;
    // The asynchronous request this context belongs to.
    aiocb *cb;
    // Moment this context object was created; referenced only by the
    // commented-out timing output in the completion handler.
    const std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
};
|
||||
|
||||
constexpr void check_word_simple(const char *word, const ssize_t size) {
|
||||
if (size < SHORTEST_BAD_WORD || size > LONGEST_BAD_WORD) {
|
||||
return;
|
||||
}
|
||||
// if (BAD_WORDS_SET.contains(word)) {
|
||||
// totalForbiddenCount++;
|
||||
// }
|
||||
|
||||
const auto hs = crc32(word, size);
|
||||
|
||||
for (int i = 0; i < BAD_WORDS_HASH.size(); ++i) {
|
||||
if (BAD_WORDS_HASH[i] == hs) {
|
||||
totalForbiddenCount++;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void read_str(char *str, ssize_t size) {
|
||||
int mark = -1;
|
||||
|
||||
int fileWords = 0;
|
||||
for (int pos = 0; pos <= size; ++pos) {
|
||||
char *c = str + pos;
|
||||
|
||||
if (*c == '.') {
|
||||
totalSentenceCount++;
|
||||
}
|
||||
|
||||
if (*c == ' ' || *c == '\n' || *c == '\r' || *c == '\t') {
|
||||
if (mark != -1) {
|
||||
check_word_simple(str + mark, pos - mark);
|
||||
mark = -1;
|
||||
}
|
||||
} else if (mark == -1) {
|
||||
++fileWords;
|
||||
if (*c >= 'A' && *c <= 'Z') {
|
||||
totalCapitalizedCount++;
|
||||
}
|
||||
|
||||
mark = pos;
|
||||
} else if (*c >= '0' && *c <= '9') {
|
||||
totalNumberCount++;
|
||||
for (; pos <= size; ++pos) {
|
||||
c = str + pos;
|
||||
if (*c == '.') {
|
||||
totalSentenceCount++;
|
||||
}
|
||||
if (*c == ' ' || *c == '\n' || *c == '\r' || *c == '\t') {
|
||||
break;
|
||||
}
|
||||
}
|
||||
mark = -1;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (mark != -1) {
|
||||
check_word_simple(str + mark, size - mark);
|
||||
}
|
||||
|
||||
totalWordCount += fileWords;
|
||||
}
|
||||
|
||||
|
||||
void aio_completion_handler(sigval_t sigval) {
|
||||
|
||||
fileCount++;
|
||||
info *data = (info *)sigval.sival_ptr;
|
||||
auto req = data->cb;
|
||||
// auto req = (struct aiocb *)sigval.sival_ptr;
|
||||
/* Did the request complete? */
|
||||
auto error = aio_error(req);
|
||||
if (error == 0) {
|
||||
|
||||
/* Request completed successfully, get the return status */
|
||||
// const auto start{std::chrono::steady_clock::now()};
|
||||
// const std::chrono::duration<double> start_seconds{start - (data->start)};
|
||||
// std::println("File started {} in {}", data->name, start_seconds.count());
|
||||
read_str((char *)req->aio_buf, aio_return(req));
|
||||
// const auto finish{std::chrono::steady_clock::now()};
|
||||
// const std::chrono::duration<double> elapsed_seconds{finish - (data->start)};
|
||||
// std::println("File read {} in {}", data->name, elapsed_seconds.count());
|
||||
} else {
|
||||
std::println("Error at aio_error ({}): ", error);
|
||||
failCount++;
|
||||
}
|
||||
--done;
|
||||
|
||||
}
|
||||
|
||||
int main(const int argc, char *argv[]) {
|
||||
if (argc < 2) {
|
||||
std::println("Usage: {} <file1> <file2> ... <fileN>", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
done = argc - 1;
|
||||
|
||||
// lio_listio
|
||||
auto aiocb_list = (struct aiocb *)malloc(sizeof(struct aiocb) * (argc - 1));
|
||||
auto aiocb_list_ptr = (struct aiocb **)malloc(sizeof(struct aiocb *) * (argc - 1));
|
||||
|
||||
// char *memchnk = (char *)malloc(5 * 1024 * 1024 * (argc - 1));
|
||||
|
||||
for (std::size_t i = 0; i < argc - 1; i++) {
|
||||
aiocb_list[i].aio_fildes = open(argv[i + 1], O_RDONLY);
|
||||
aiocb_list[i].aio_offset = 0;
|
||||
// 5mb
|
||||
aiocb_list[i].aio_buf = malloc(5 * 1024 * 1024);
|
||||
aiocb_list[i].aio_nbytes = (5 * 1024 * 1024);;
|
||||
|
||||
aiocb_list[i].aio_sigevent.sigev_notify = SIGEV_THREAD;
|
||||
aiocb_list[i].aio_sigevent.sigev_notify_function = aio_completion_handler;
|
||||
aiocb_list[i].aio_sigevent.sigev_notify_attributes = nullptr;
|
||||
// aiocb_list[i].aio_sigevent.sigev_value.sival_ptr = &aiocb_list[i];
|
||||
aiocb_list[i].aio_sigevent.sigev_value.sival_ptr = new info{
|
||||
argv[i + 1],
|
||||
&aiocb_list[i]};
|
||||
|
||||
// aiocb_list[i].aio_reqprio = SIGRTMIN;
|
||||
|
||||
aiocb_list_ptr[i] = &aiocb_list[i];
|
||||
}
|
||||
|
||||
lio_listio(LIO_WAIT, aiocb_list_ptr, argc - 1, nullptr);
|
||||
|
||||
while (done > 0) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1));
|
||||
}
|
||||
|
||||
std::println("Done reading files, {} done", done);
|
||||
|
||||
double capitalizedPercentage = (totalWordCount > 0)
|
||||
? static_cast<double>(totalCapitalizedCount) / totalWordCount * 100.0
|
||||
: 0;
|
||||
double forbiddenPercentage = (totalWordCount > 0)
|
||||
? static_cast<double>(totalForbiddenCount) / totalWordCount * 100.0
|
||||
: 0;
|
||||
double wordCountPerSentence = (totalSentenceCount > 0)
|
||||
? static_cast<double>(totalWordCount) / totalSentenceCount
|
||||
: 0;
|
||||
|
||||
std::println(
|
||||
"Word Count: {}\nCapitalized Count: {}\nSentence Count: {}\nNumber Count: {}\nForbidden Count: {}\nFile Count: {}\nFail Count: {}\nCapitalized Percentage: {}%\nForbidden Percentage: {}%\nWord Count Per Sentence: {}",
|
||||
totalWordCount, totalCapitalizedCount, totalSentenceCount, totalNumberCount, totalForbiddenCount, fileCount, failCount,
|
||||
capitalizedPercentage, forbiddenPercentage, wordCountPerSentence
|
||||
);
|
||||
|
||||
for (std::size_t i = 0; i < argc - 1; i++) {
|
||||
close(aiocb_list[i].aio_fildes);
|
||||
free((void *)aiocb_list[i].aio_buf);
|
||||
}
|
||||
|
||||
free(aiocb_list);
|
||||
free(aiocb_list_ptr);
|
||||
// free(memchnk);
|
||||
|
||||
if (failCount > 0) {
|
||||
return 1;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user