340 lines
9.3 KiB
C
Raw Normal View History

2026-01-29 00:38:21 +01:00
// retoor <retoor@molodetz.nl>
#include "json_repair.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <ctype.h>
// Return a newly allocated copy of src with comments removed.
//
// Outside string literals the following are dropped:
//   - line comments starting with "//" or "#", up to (not including) the newline
//   - block comments delimited by slash-star and star-slash; an unterminated
//     block comment runs to the end of the input
//
// Both "..." and '...' are treated as string literals. Single-quoted strings
// have to be recognised here so that text such as '#fff' or 'http://host' is
// not mistaken for a comment. A quote character only closes a string opened
// by the same character.
//
// A backslash makes the following byte be copied verbatim, wherever it occurs.
//
// Returns NULL when src is NULL or allocation fails. The caller frees the result.
static char *strip_comments(const char *src) {
    if (!src) return NULL;
    char *result = malloc(strlen(src) + 1); // output is never longer than input
    if (!result) return NULL;

    char *dst = result;
    const char *p = src;
    char quote = '\0';    // delimiter of the string we are inside, or 0 when outside
    bool escaped = false; // previous byte was a backslash

    while (*p) {
        if (escaped) {
            *dst++ = *p++;
            escaped = false;
            continue;
        }
        if (*p == '\\') {
            *dst++ = *p++;
            escaped = true;
            continue;
        }
        if (*p == '"' || *p == '\'') {
            if (quote == '\0') {
                quote = *p;   // opens a string
            } else if (quote == *p) {
                quote = '\0'; // closes the string it opened
            }
            // Otherwise it is the other quote kind inside a string: plain content.
            *dst++ = *p++;
            continue;
        }
        if (quote == '\0') {
            // p[1] is at worst the terminating NUL, so the look-ahead is in bounds.
            if (*p == '#' || (p[0] == '/' && p[1] == '/')) {
                while (*p && *p != '\n') p++;
                continue;
            }
            if (p[0] == '/' && p[1] == '*') {
                p += 2;
                while (*p && !(p[0] == '*' && p[1] == '/')) p++;
                if (*p) p += 2; // step over the terminator when there is one
                continue;
            }
        }
        *dst++ = *p++;
    }
    *dst = '\0';
    return result;
}
// Classify the UTF-8 sequence starting at p.
// Returns '"' for U+201C / U+201D (typographic double quotes), '\'' for
// U+2018 / U+2019 (typographic single quotes) and 0 for anything else.
// Each later byte is inspected only after the previous one matched, so the
// function never reads past the terminating NUL.
static char typographic_quote_at(const char *p) {
    const unsigned char *u = (const unsigned char *)p;
    if (u[0] != 0xE2 || u[1] != 0x80) return '\0';
    if (u[2] == 0x9C || u[2] == 0x9D) return '"';
    if (u[2] == 0x98 || u[2] == 0x99) return '\'';
    return '\0';
}

// Return a newly allocated copy of src in which every string delimited by
// something other than ASCII double quotes is rewritten as a regular
// "..." string.
//
// Handled delimiters (outside regular "..." strings only):
//   - ASCII single quotes:           '...'  closes at the next '
//   - typographic single quotes:     closes at the next U+2018 / U+2019
//   - typographic double quotes:     closes at the next U+201C / U+201D or at
//                                    an ASCII " (mixed delimiters)
//
// While copying such a string:
//   - a bare " is escaped as \" so it cannot terminate the new string
//   - \' is reduced to ' because \' is not a legal JSON escape
//   - any other backslash pair is copied untouched
//   - if the input ends before the closer, the string is closed anyway
//
// Text inside regular "..." strings is copied byte for byte, so typographic
// quotes that are part of string content are preserved.
//
// Returns NULL when src is NULL or allocation fails. The caller frees the result.
static char *normalize_quotes(const char *src) {
    if (!src) return NULL;
    size_t len = strlen(src);
    // Worst case every byte is a " that grows to \", plus a synthesized closer
    // that is paid for by the opener: 2 * len bytes, plus the NUL.
    char *result = malloc(len * 2 + 1);
    if (!result) return NULL;

    char *dst = result;
    const char *p = src;
    bool in_double_string = false;
    bool escaped = false;

    while (*p) {
        if (escaped) {
            *dst++ = *p++;
            escaped = false;
            continue;
        }
        if (*p == '\\') {
            *dst++ = *p++;
            escaped = true;
            continue;
        }
        if (*p == '"') {
            in_double_string = !in_double_string;
            *dst++ = *p++;
            continue;
        }
        if (in_double_string) {
            *dst++ = *p++; // string content: leave it exactly as written
            continue;
        }

        char smart = typographic_quote_at(p);
        if (smart == '\0' && *p != '\'') {
            *dst++ = *p++; // ordinary byte outside any string
            continue;
        }

        // A non-standard string starts here; re-emit it with ASCII double quotes.
        bool smart_open = (smart != '\0');
        char family = smart_open ? smart : '\'';
        *dst++ = '"';
        p += smart_open ? 3 : 1;

        while (*p) {
            size_t close_len = 0;
            if (smart_open) {
                if (typographic_quote_at(p) == family) {
                    close_len = 3;
                } else if (family == '"' && *p == '"') {
                    close_len = 1; // opened typographically, closed with ASCII "
                }
            } else if (*p == '\'') {
                close_len = 1;
            }
            if (close_len != 0) {
                p += close_len;
                break;
            }

            if (*p == '\\' && p[1] != '\0') {
                if (p[1] == '\'') {
                    *dst++ = '\''; // drop the backslash: \' is invalid in JSON
                    p += 2;
                } else {
                    *dst++ = *p++;
                    *dst++ = *p++;
                }
            } else if (*p == '"') {
                *dst++ = '\\';
                *dst++ = '"';
                p++;
            } else {
                *dst++ = *p++;
            }
        }
        *dst++ = '"'; // always close, even when the input ended early
    }
    *dst = '\0';
    return result;
}
// Return a newly allocated copy of src with trailing commas removed.
//
// Outside "..." strings, a comma is dropped when the next thing after it
// (ignoring whitespace and any further commas) is ']', '}' or the end of the
// input. The skipped whitespace and extra commas are dropped with it.
// Commas that are followed by a value are left alone.
//
// A backslash makes the following byte be copied verbatim, wherever it occurs.
//
// Returns NULL when src is NULL or allocation fails. The caller frees the result.
static char *remove_trailing_commas(const char *src) {
    if (!src) return NULL;
    char *result = malloc(strlen(src) + 1); // output is never longer than input
    if (!result) return NULL;

    char *dst = result;
    const char *p = src;
    bool in_string = false;
    bool escaped = false;

    while (*p) {
        if (escaped) {
            *dst++ = *p++;
            escaped = false;
            continue;
        }
        if (*p == '\\') {
            *dst++ = *p++;
            escaped = true;
            continue;
        }
        if (*p == '"') {
            in_string = !in_string;
            *dst++ = *p++;
            continue;
        }
        if (!in_string && *p == ',') {
            // Look past whitespace and repeated commas such as "[1,,]".
            // isspace(0) is false, so the scan stops at the terminating NUL.
            const char *next = p + 1;
            while (*next == ',' || isspace((unsigned char)*next)) next++;
            if (*next == ']' || *next == '}' || *next == '\0') {
                p = next; // nothing but a closer (or nothing at all) follows
                continue;
            }
        }
        *dst++ = *p++;
    }
    *dst = '\0';
    return result;
}
// True for the bytes accepted in a bare (unquoted) object key.
static bool is_bare_key_char(char c) {
    return isalnum((unsigned char)c) || c == '_' || c == '-';
}

// Return a newly allocated copy of src in which bare object keys are wrapped
// in double quotes, e.g. {a: 1, b-c: 2} becomes {"a": 1, "b-c": 2}.
//
// A run of letters, digits, '_' and '-' outside "..." strings is treated as
// a key when both of these hold:
//   - the previous non-whitespace byte of src is '{' or ','
//   - the next non-whitespace byte after the run is ':'
// Everything else is copied unchanged. A backslash makes the following byte
// be copied verbatim, wherever it occurs.
//
// Returns NULL when src is NULL or allocation fails. The caller frees the result.
static char *quote_unquoted_keys(const char *src) {
    if (!src) return NULL;
    size_t len = strlen(src);
    // Each quoted key adds two bytes; 2 * len is a comfortable upper bound.
    char *result = malloc(len * 2 + 1);
    if (!result) return NULL;

    char *dst = result;
    const char *p = src;
    bool in_string = false;
    bool escaped = false;

    while (*p) {
        if (escaped) {
            *dst++ = *p++;
            escaped = false;
            continue;
        }
        if (*p == '\\') {
            *dst++ = *p++;
            escaped = true;
            continue;
        }
        if (*p == '"') {
            in_string = !in_string;
            *dst++ = *p++;
            continue;
        }
        if (!in_string && is_bare_key_char(*p)) {
            // Look behind with an index so no pointer below src is ever formed.
            size_t before = (size_t)(p - src);
            while (before > 0 && isspace((unsigned char)src[before - 1])) before--;

            if (before > 0 && (src[before - 1] == '{' || src[before - 1] == ',')) {
                const char *end = p;
                while (is_bare_key_char(*end)) end++;
                const char *after = end;
                while (isspace((unsigned char)*after)) after++;

                if (*after == ':') {
                    *dst++ = '"';
                    while (p < end) *dst++ = *p++;
                    *dst++ = '"';
                    continue;
                }
            }
        }
        *dst++ = *p++;
    }
    *dst = '\0';
    return result;
}
// Return a newly allocated copy of src in which '{' '}' '[' ']' outside
// "..." strings are properly nested and closed.
//
// Repairs applied:
//   - a closer with no matching opener anywhere on the stack is dropped
//   - a closer whose opener is deeper on the stack first closes the openers
//     above it, so {"a": [1, 2} becomes {"a": [1, 2]}
//   - openers still pending at the end of input get their closers appended
//   - a string still open at the end of input is closed first (and a
//     dangling trailing backslash removed) so the appended closers do not
//     end up inside it
//
// Nesting depth is not limited: the opener stack is sized from the input.
// A backslash makes the following byte be copied verbatim, wherever it occurs.
//
// Returns NULL when src is NULL or allocation fails. The caller frees the result.
static char *balance_brackets(const char *src) {
    if (!src) return NULL;
    size_t len = strlen(src);

    // Every synthesized closer pops one opener and there are at most len
    // openers, so output <= 2 * len, plus one closing quote, plus the NUL.
    char *result = malloc(len * 2 + 2);
    if (!result) return NULL;
    char *stack = malloc(len + 1); // +1 keeps the request non-zero for ""
    if (!stack) {
        free(result);
        return NULL;
    }

    size_t top = 0;
    size_t open_braces = 0;   // number of '{' currently on the stack
    size_t open_brackets = 0; // number of '[' currently on the stack
    char *dst = result;
    const char *p = src;
    bool in_string = false;
    bool escaped = false;

    while (*p) {
        if (escaped) {
            *dst++ = *p++;
            escaped = false;
            continue;
        }
        if (*p == '\\') {
            *dst++ = *p++;
            escaped = true;
            continue;
        }
        if (*p == '"') {
            in_string = !in_string;
            *dst++ = *p++;
            continue;
        }
        if (!in_string) {
            if (*p == '{' || *p == '[') {
                stack[top++] = *p;
                if (*p == '{') open_braces++; else open_brackets++;
            } else if (*p == '}' || *p == ']') {
                char opener = (*p == '}') ? '{' : '[';
                size_t *wanted = (*p == '}') ? &open_braces : &open_brackets;
                if (*wanted == 0) {
                    p++; // nothing of this kind is open: stray closer, drop it
                    continue;
                }
                // The counter guarantees a match exists, so this loop stops
                // before the stack is exhausted. Each pop happens once per
                // opener, keeping the whole pass linear.
                while (stack[top - 1] != opener) {
                    char inner = stack[--top];
                    if (inner == '{') open_braces--; else open_brackets--;
                    *dst++ = (inner == '{') ? '}' : ']';
                }
                top--;
                (*wanted)--;
            }
        }
        *dst++ = *p++;
    }

    if (in_string) {
        if (escaped) dst--; // last byte written was a lone backslash
        *dst++ = '"';
    }
    while (top > 0) {
        char opener = stack[--top];
        *dst++ = (opener == '{') ? '}' : ']';
    }
    *dst = '\0';

    free(stack);
    return result;
}
// Produce a newly allocated copy of src with all whitespace that lies
// outside "..." strings removed. Whitespace inside strings is preserved.
// The byte following a backslash is always kept as-is and never toggles
// the string state.
//
// Returns NULL when src is NULL or allocation fails. The caller frees the result.
static char *compact_json(const char *src) {
    if (src == NULL) return NULL;

    char *out = malloc(strlen(src) + 1);
    if (out == NULL) return NULL;

    size_t written = 0;
    bool inside_quotes = false;
    bool take_verbatim = false; // set right after a backslash

    for (const char *rd = src; *rd != '\0'; rd++) {
        const char c = *rd;

        if (take_verbatim) {
            take_verbatim = false;
        } else if (c == '\\') {
            take_verbatim = true;
        } else if (c == '"') {
            inside_quotes = !inside_quotes;
        } else if (!inside_quotes && isspace((unsigned char)c)) {
            continue; // insignificant whitespace: skip it
        }
        out[written++] = c;
    }

    out[written] = '\0';
    return out;
}
char *json_repair_string(const char *src) {
if (!src) return NULL;
// Find the first occurrence of { or [
const char *start_ptr = src;
while (*start_ptr && *start_ptr != '{' && *start_ptr != '[') start_ptr++;
if (!*start_ptr) return strdup(src); // No JSON structure found, return as is
char *s1 = strip_comments(start_ptr);
char *s2 = normalize_quotes(s1);
free(s1);
char *s3 = quote_unquoted_keys(s2);
free(s2);
char *s4 = remove_trailing_commas(s3);
free(s3);
char *s5 = balance_brackets(s4);
free(s4);
// Heuristic: truncate after the first complete object/array
int depth = 0;
bool in_str = false;
bool esc = false;
char *p = s5;
while (*p) {
if (esc) { esc = false; }
else if (*p == '\\') { esc = true; }
else if (*p == '"') { in_str = !in_str; }
else if (!in_str) {
if (*p == '{' || *p == '[') depth++;
else if (*p == '}' || *p == ']') {
depth--;
if (depth == 0) {
*(p + 1) = '\0';
break;
}
}
}
p++;
}
char *s6 = compact_json(s5);
free(s5);
return s6;
}