Updated file io perf

This commit is contained in:
BordedDev 2025-03-23 22:06:48 +01:00
parent a08c181872
commit 75092cb738
No known key found for this signature in database
GPG Key ID: C5F495EAE56673BF
4 changed files with 588 additions and 6 deletions

View File

@ -17,7 +17,7 @@ build_cpp:
build_borded_cpp:
@echo "Compiling Borded C++ version of isspam."
@g++ -std=c++23 -Ofast borded_cpp/src/main.cpp -o borded_cpp_exec
@g++ -std=c++23 -Ofast borded_cpp/src/main3.cpp -o borded_cpp_exec -ltbb
build_risspam:
@echo "Compiling 12bitfloat_risspam project."

View File

@ -3,12 +3,12 @@ import time
print("***benchmarking***")
time_start = time.time()
subprocess.check_output('./isspam books/*.txt', shell=True)
print("Time C:",time.time() - time_start)
time_start = time.time()
subprocess.check_output('./risspam -p books/*.txt', shell=True)
print("Time Rust:",time.time() - time_start)
time_start = time.time()
subprocess.check_output('./isspam books/*.txt', shell=True)
print("Time C:",time.time() - time_start)
time_start = time.time()
subprocess.check_output('./isspam_cpp books/*.txt', shell=True)
print("Time CPP:",time.time() - time_start)
time_start = time.time()

View File

@ -15,7 +15,13 @@ else ()
add_compile_options(-Wall)
add_compile_options(-Wextra)
add_compile_options(-Wpedantic)
add_compile_options(-Werror)
# add_compile_options(-Werror)
endif ()
add_executable(${PROJECT_NAME} src/main.cpp)
add_executable(${PROJECT_NAME} src/main.cpp)
add_executable(${PROJECT_NAME}3 src/main3.cpp)
if (LINUX)
target_link_libraries(${PROJECT_NAME} tbb)
target_link_libraries(${PROJECT_NAME}3 tbb)
endif ()

576
borded_cpp/src/main3.cpp Normal file
View File

@ -0,0 +1,576 @@
#include <aio.h>
#include <fcntl.h>
#include <sys/signal.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <algorithm>
#include <array>
#include <cerrno>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <execution>
#include <format>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <mutex>
#include <string>
#include <string_view>
#include <thread>
#include <unordered_set>
#include <vector>
#ifdef __cpp_lib_print
#include <print>
#else
namespace std {
    /// Stand-in for std::print on toolchains that do not define
    /// __cpp_lib_print: formats `format` with `args` through std::vformat and
    /// writes the result to std::cout.
    template <typename T, typename... Args>
    inline void print(T format, Args &&... args) {
        std::cout << std::vformat(format, std::make_format_args(args...));
    }

    /// Stand-in for std::println: same as print() above, followed by
    /// std::endl (newline plus a flush of std::cout).
    template <typename T, typename... Args>
    inline void println(T format, Args &&... args) {
        std::cout << std::vformat(format, std::make_format_args(args...)) << std::endl;
    }
}
#endif
/// Tokens that count as "forbidden" when they appear as a whole
/// whitespace-delimited word. The order matters: BAD_WORDS_HASH lists the
/// CRC-32 of the same literals in the same order.
constexpr std::array<std::string_view, 35> BAD_WORDS{{
    "recovery", "techie", "http", "https", "digital",
    "hack", "::", "//", "com", "@",
    "crypto", "bitcoin", "wallet", "hacker", "welcome",
    "whatsapp", "email", "cryptocurrency", "stolen", "freeze",
    "quick", "crucial", "tracing", "scammers", "expers",
    "hire", "century", "transaction", "essential", "managing",
    "contact", "contacting", "understanding", "assets", "funds",
}};

/// Hash-set view of BAD_WORDS. In this file it is only mentioned by the
/// commented-out lookup inside check_word_simple().
const std::unordered_set<std::string_view> BAD_WORDS_SET(BAD_WORDS.cbegin(), BAD_WORDS.cend());
/// Builds the byte-indexed CRC-32 lookup table at compile time.
/// Entry `b` is `b` pushed through eight shift/conditional-xor rounds with the
/// reflected polynomial 0xEDB88320 (so entry 0 is 0, entry 1 is 0x77073096,
/// entry 128 is the polynomial itself and entry 255 is 0x2d02ef8d).
static constexpr std::array<unsigned int, 256> make_crc_table() {
    std::array<unsigned int, 256> table{};
    for (unsigned int byte = 0; byte < 256; ++byte) {
        unsigned int entry = byte;
        for (int round = 0; round < 8; ++round) {
            entry = (entry & 1u) ? (entry >> 1) ^ 0xedb88320u : (entry >> 1);
        }
        table[byte] = entry;
    }
    return table;
}

/// Lookup table used by both crc32() overloads; indexed by one byte value.
static constexpr std::array<unsigned int, 256> crc_table = make_crc_table();

/// CRC-32 of the `size` bytes starting at `str`.
/// The running value starts at 0xffffffff, consumes one byte per step through
/// crc_table, and is bit-inverted before being returned.
constexpr uint32_t crc32(char const *str, const size_t size) {
    uint32_t state = 0xffffffff;
    for (char const *cursor = str, *const stop = str + size; cursor != stop; ++cursor) {
        // Only the low eight bits select the table slot, so the signedness of
        // plain `char` does not influence the result.
        const auto slot = static_cast<unsigned char>(state ^ static_cast<unsigned char>(*cursor));
        state = crc_table[slot] ^ (state >> 8);
    }
    return ~state;
}

/// CRC-32 of every character in `str`; forwards to the pointer/length overload.
constexpr uint32_t crc32(std::string_view str) {
    return crc32(str.data(), str.size());
}
/// Compile-time CRC-32 of each forbidden token, in the same order as the
/// BAD_WORDS list. check_word_simple() compares candidate words against these
/// hashes. The two lists are maintained by hand and must be edited together.
constexpr std::array<uint32_t, 35> BAD_WORDS_HASH{{
    crc32("recovery"), crc32("techie"), crc32("http"), crc32("https"), crc32("digital"),
    crc32("hack"), crc32("::"), crc32("//"), crc32("com"), crc32("@"),
    crc32("crypto"), crc32("bitcoin"), crc32("wallet"), crc32("hacker"), crc32("welcome"),
    crc32("whatsapp"), crc32("email"), crc32("cryptocurrency"), crc32("stolen"), crc32("freeze"),
    crc32("quick"), crc32("crucial"), crc32("tracing"), crc32("scammers"), crc32("expers"),
    crc32("hire"), crc32("century"), crc32("transaction"), crc32("essential"), crc32("managing"),
    crc32("contact"), crc32("contacting"), crc32("understanding"), crc32("assets"), crc32("funds"),
}};
const std::unordered_set BAD_WORDS_STR(BAD_WORDS.begin(), BAD_WORDS.end());
constexpr auto SHORTEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::max(),
[](std::size_t current, const std::string_view &word) {
return std::min(current, word.size());
}
);
constexpr auto LONGEST_BAD_WORD = std::ranges::fold_left(BAD_WORDS, std::numeric_limits<std::size_t>::min(),
[](std::size_t current, const std::string_view &word) {
return std::max(current, word.size());
}
);
// Program-wide tallies, printed by main() once every file has been handled.
// They are plain ints and are written from aio_completion_handler(), which is
// registered with SIGEV_THREAD; code touching them from more than one thread
// has to provide its own synchronisation.

/// Whitespace-delimited words seen so far (added per buffer by read_str()).
int totalWordCount{0};
/// Words whose first character is 'A'..'Z'.
int totalCapitalizedCount{0};
/// Number of '.' characters seen.
int totalSentenceCount{0};
/// Words containing a digit after their first character.
int totalNumberCount{0};
/// Words whose CRC-32 matches an entry of BAD_WORDS_HASH.
int totalForbiddenCount{0};
/// Incremented once per completion callback.
/// NOTE(review): starts at 1, so the printed value is one more than the number
/// of callbacks - confirm that this offset is intended.
int fileCount{1};
/// Requests whose completion reported an error.
int failCount{0};
/// Requests still outstanding; main() waits until this drops to zero.
int done{0};
/// Per-request context handed to aio_completion_handler() through
/// sigev_value.sival_ptr.
struct info {
    /// Path of the file being read (a view of the argv entry it was built from).
    std::string_view name;
    /// Control block of the read this context belongs to; not owned.
    struct aiocb *cb;
    /// Moment this context was created; only the commented-out timing code in
    /// aio_completion_handler() refers to it.
    const std::chrono::time_point<std::chrono::steady_clock> start{std::chrono::steady_clock::now()};
};
/// Bumps totalForbiddenCount when the `size` bytes at `word` hash to one of
/// the values in BAD_WORDS_HASH.
///
/// Words shorter than SHORTEST_BAD_WORD or longer than LONGEST_BAD_WORD are
/// rejected before hashing. A negative `size` converts to a huge unsigned
/// value and is rejected by the same range test.
///
/// Only the CRC-32 is compared, never the text itself, so a different word
/// that happens to share a hash with a listed word is counted as well.
constexpr void check_word_simple(const char *word, const ssize_t size) {
    const auto length = static_cast<std::size_t>(size);
    const bool length_in_range = SHORTEST_BAD_WORD <= length && length <= LONGEST_BAD_WORD;
    if (!length_in_range) {
        return;
    }
    const uint32_t digest = crc32(word, length);
    for (const uint32_t listed : BAD_WORDS_HASH) {
        if (listed == digest) {
            ++totalForbiddenCount;
            break;  // one hit per word is enough
        }
    }
}
/// Scans the `size` bytes at `str` and updates the global tallies.
///
/// - every '.' adds one to totalSentenceCount;
/// - a word is a maximal run of bytes other than ' ', '\n', '\r' and '\t';
///   each word adds one to the word count, and to totalCapitalizedCount when
///   its first byte is 'A'..'Z';
/// - a digit anywhere after the first byte of a word adds one to
///   totalNumberCount and the rest of that word is skipped (it is never
///   passed to check_word_simple());
/// - every other word is handed to check_word_simple().
///
/// Only bytes [0, size) are inspected; a non-positive `size` leaves every
/// counter unchanged. The function adds no synchronisation around the global
/// counters, so concurrent callers must serialise their calls.
void read_str(char *str, ssize_t size) {
    constexpr ssize_t NOT_IN_WORD = -1;
    const auto is_separator = [](const char ch) {
        return ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t';
    };

    ssize_t mark = NOT_IN_WORD;  // start offset of the word being scanned
    int fileWords = 0;

    // `pos < size` (not `<=`): the byte at index `size` is not part of the data.
    for (ssize_t pos = 0; pos < size; ++pos) {
        const char ch = str[pos];
        if (ch == '.') {
            totalSentenceCount++;
        }
        if (is_separator(ch)) {
            if (mark != NOT_IN_WORD) {
                check_word_simple(str + mark, pos - mark);
                mark = NOT_IN_WORD;
            }
        } else if (mark == NOT_IN_WORD) {
            ++fileWords;
            if (ch >= 'A' && ch <= 'Z') {
                totalCapitalizedCount++;
            }
            mark = pos;
        } else if (ch >= '0' && ch <= '9') {
            totalNumberCount++;
            // Consume the remainder of this token, still counting its dots.
            for (; pos < size; ++pos) {
                const char rest = str[pos];
                if (rest == '.') {
                    totalSentenceCount++;
                }
                if (is_separator(rest)) {
                    break;
                }
            }
            mark = NOT_IN_WORD;
        }
    }

    // A word that runs up to the very end of the data has no trailing separator.
    if (mark != NOT_IN_WORD) {
        check_word_simple(str + mark, size - mark);
    }
    totalWordCount += fileWords;
}
void aio_completion_handler(sigval_t sigval) {
fileCount++;
info *data = (info *)sigval.sival_ptr;
auto req = data->cb;
// auto req = (struct aiocb *)sigval.sival_ptr;
/* Did the request complete? */
auto error = aio_error(req);
if (error == 0) {
/* Request completed successfully, get the return status */
// const auto start{std::chrono::steady_clock::now()};
// const std::chrono::duration<double> start_seconds{start - (data->start)};
// std::println("File started {} in {}", data->name, start_seconds.count());
read_str((char *)req->aio_buf, aio_return(req));
// const auto finish{std::chrono::steady_clock::now()};
// const std::chrono::duration<double> elapsed_seconds{finish - (data->start)};
// std::println("File read {} in {}", data->name, elapsed_seconds.count());
} else {
std::println("Error at aio_error ({}): ", error);
failCount++;
}
--done;
}
int main(const int argc, char *argv[]) {
if (argc < 2) {
std::println("Usage: {} <file1> <file2> ... <fileN>", argv[0]);
return 1;
}
done = argc - 1;
// lio_listio
auto aiocb_list = (struct aiocb *)malloc(sizeof(struct aiocb) * (argc - 1));
auto aiocb_list_ptr = (struct aiocb **)malloc(sizeof(struct aiocb *) * (argc - 1));
// char *memchnk = (char *)malloc(5 * 1024 * 1024 * (argc - 1));
for (std::size_t i = 0; i < argc - 1; i++) {
aiocb_list[i].aio_fildes = open(argv[i + 1], O_RDONLY);
aiocb_list[i].aio_offset = 0;
// 5mb
aiocb_list[i].aio_buf = malloc(5 * 1024 * 1024);
aiocb_list[i].aio_nbytes = (5 * 1024 * 1024);;
aiocb_list[i].aio_sigevent.sigev_notify = SIGEV_THREAD;
aiocb_list[i].aio_sigevent.sigev_notify_function = aio_completion_handler;
aiocb_list[i].aio_sigevent.sigev_notify_attributes = nullptr;
// aiocb_list[i].aio_sigevent.sigev_value.sival_ptr = &aiocb_list[i];
aiocb_list[i].aio_sigevent.sigev_value.sival_ptr = new info{
argv[i + 1],
&aiocb_list[i]};
// aiocb_list[i].aio_reqprio = SIGRTMIN;
aiocb_list_ptr[i] = &aiocb_list[i];
}
lio_listio(LIO_WAIT, aiocb_list_ptr, argc - 1, nullptr);
while (done > 0) {
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
std::println("Done reading files, {} done", done);
double capitalizedPercentage = (totalWordCount > 0)
? static_cast<double>(totalCapitalizedCount) / totalWordCount * 100.0
: 0;
double forbiddenPercentage = (totalWordCount > 0)
? static_cast<double>(totalForbiddenCount) / totalWordCount * 100.0
: 0;
double wordCountPerSentence = (totalSentenceCount > 0)
? static_cast<double>(totalWordCount) / totalSentenceCount
: 0;
std::println(
"Word Count: {}\nCapitalized Count: {}\nSentence Count: {}\nNumber Count: {}\nForbidden Count: {}\nFile Count: {}\nFail Count: {}\nCapitalized Percentage: {}%\nForbidden Percentage: {}%\nWord Count Per Sentence: {}",
totalWordCount, totalCapitalizedCount, totalSentenceCount, totalNumberCount, totalForbiddenCount, fileCount, failCount,
capitalizedPercentage, forbiddenPercentage, wordCountPerSentence
);
for (std::size_t i = 0; i < argc - 1; i++) {
close(aiocb_list[i].aio_fildes);
free((void *)aiocb_list[i].aio_buf);
}
free(aiocb_list);
free(aiocb_list_ptr);
// free(memchnk);
if (failCount > 0) {
return 1;
}
}