340 lines
9.3 KiB
C
340 lines
9.3 KiB
C
|
|
// retoor <retoor@molodetz.nl>
|
|||
|
|
#include "json_repair.h"
|
|||
|
|
#include <stdio.h>
|
|||
|
|
#include <stdlib.h>
|
|||
|
|
#include <string.h>
|
|||
|
|
#include <stdbool.h>
|
|||
|
|
#include <ctype.h>
|
|||
|
|
// Returns a newly allocated copy of `src` with comments removed.
//
// Three comment forms are dropped when they appear outside a string:
//   - `// ...` and `# ...` up to (not including) the next newline
//   - `/* ... */` including its delimiters (an unterminated block comment
//     swallows the rest of the input)
//
// Both double-quoted and single-quoted strings are tracked, so text such as
// 'http://host' or '#fff' is kept intact. A backslash makes the following
// character literal, both inside and outside strings.
//
// Returns NULL if `src` is NULL or allocation fails. The caller owns the
// returned buffer and must free() it.
static char *strip_comments(const char *src) {
    if (!src) return NULL;

    size_t len = strlen(src);
    // Output is never longer than the input: bytes are only copied or dropped.
    char *result = malloc(len + 1);
    if (!result) return NULL;

    char *dst = result;
    const char *p = src;
    char quote = '\0';     // active string delimiter ('"' or '\''), 0 when outside a string
    bool escaped = false;  // previous character was an unescaped backslash

    while (*p) {
        if (escaped) {
            *dst++ = *p++;
            escaped = false;
            continue;
        }
        if (*p == '\\') {
            *dst++ = *p++;
            escaped = true;
            continue;
        }
        if (quote) {
            // Inside a string: only the matching delimiter ends it.
            if (*p == quote) quote = '\0';
            *dst++ = *p++;
            continue;
        }
        if (*p == '"' || *p == '\'') {
            quote = *p;
            *dst++ = *p++;
            continue;
        }
        if (*p == '/' && p[1] == '/') {
            // Line comment: keep the newline itself.
            while (*p && *p != '\n') p++;
            continue;
        }
        if (*p == '/' && p[1] == '*') {
            p += 2;
            while (*p && !(*p == '*' && p[1] == '/')) p++;
            if (*p) p += 2;  // skip the closing "*/" when present
            continue;
        }
        if (*p == '#') {
            while (*p && *p != '\n') p++;
            continue;
        }
        *dst++ = *p++;
    }

    *dst = '\0';
    return result;
}
|
|||
|
|
// Returns a newly allocated copy of `src` in which quoting is normalised
// towards JSON's double-quoted strings.
//
//   - UTF-8 curly double quotes (E2 80 9C / E2 80 9D) become '"'.
//   - UTF-8 curly single quotes (E2 80 98 / E2 80 99) become '\''.
//   - Outside a double-quoted string, '...' is rewritten as "...":
//       * an embedded '"' is escaped as \"
//       * an escaped apostrophe \' is emitted as a plain ' because \' is not
//         a valid JSON escape sequence
//       * every other backslash pair is copied unchanged
//     An unterminated single-quoted string is copied to the end of input
//     without a closing quote.
//
// Returns NULL if `src` is NULL or allocation fails. The caller owns the
// returned buffer and must free() it.
static char *normalize_quotes(const char *src) {
    if (!src) return NULL;

    size_t len = strlen(src);
    // Worst case is a '"' inside a single-quoted string, which grows from one
    // byte to two; nothing else expands, so 2 * len + 1 is always enough.
    char *result = malloc(len * 2 + 1);
    if (!result) return NULL;

    char *dst = result;
    const char *p = src;
    bool in_double_string = false;
    bool escaped = false;

    while (*p) {
        // Smart quote replacement. Reading p[1] and p[2] is safe: each byte
        // is only inspected after the previous one was found to be non-NUL.
        if ((unsigned char)p[0] == 0xE2 && (unsigned char)p[1] == 0x80) {
            unsigned char third = (unsigned char)p[2];
            if (third == 0x9C || third == 0x9D) {  // left / right double quote
                *dst++ = '"';
                p += 3;
                continue;
            }
            if (third == 0x98 || third == 0x99) {  // left / right single quote
                *dst++ = '\'';
                p += 3;
                continue;
            }
        }

        if (escaped) {
            *dst++ = *p++;
            escaped = false;
            continue;
        }
        if (*p == '\\') {
            *dst++ = *p++;
            escaped = true;
            continue;
        }
        if (*p == '"') {
            in_double_string = !in_double_string;
            *dst++ = *p++;
            continue;
        }

        if (!in_double_string && *p == '\'') {
            // Heuristic: convert '...' to "..."
            *dst++ = '"';
            p++;
            while (*p && *p != '\'') {
                if (*p == '\\' && p[1] == '\'') {
                    // The apostrophe no longer needs escaping once the string
                    // is double-quoted, and "\'" would be invalid JSON.
                    *dst++ = '\'';
                    p += 2;
                } else if (*p == '\\' && p[1]) {
                    *dst++ = *p++;
                    *dst++ = *p++;
                } else if (*p == '"') {
                    *dst++ = '\\';
                    *dst++ = '"';
                    p++;
                } else {
                    *dst++ = *p++;
                }
            }
            if (*p == '\'') {
                *dst++ = '"';
                p++;
            }
            continue;
        }

        *dst++ = *p++;
    }

    *dst = '\0';
    return result;
}
|
|||
|
|
// Produces a freshly allocated copy of `src` without trailing commas.
//
// A comma outside a double-quoted string is dropped, together with the
// whitespace that follows it, when the next non-whitespace character is
// ']' or '}'. Everything else is copied through unchanged. A backslash makes
// the following character literal.
//
// Returns NULL when `src` is NULL or allocation fails; the caller frees the
// result.
static char *remove_trailing_commas(const char *src) {
    if (src == NULL) return NULL;

    char *out = malloc(strlen(src) + 1);
    if (out == NULL) return NULL;

    size_t written = 0;
    bool inside_quotes = false;
    bool after_backslash = false;
    const char *cur = src;

    while (*cur != '\0') {
        const char c = *cur;

        if (after_backslash) {
            after_backslash = false;
        } else if (c == '\\') {
            after_backslash = true;
        } else if (c == '"') {
            inside_quotes = !inside_quotes;
        } else if (c == ',' && !inside_quotes) {
            // Peek past whitespace to see whether a closer follows.
            const char *look = cur + 1;
            while (*look != '\0' && isspace((unsigned char)*look)) look++;
            if (*look == ']' || *look == '}') {
                cur = look;  // resume at the closer; comma and padding vanish
                continue;
            }
        }

        out[written++] = c;
        cur++;
    }

    out[written] = '\0';
    return out;
}
|
|||
|
|
// Returns a newly allocated copy of `src` in which bare object keys are
// wrapped in double quotes.
//
// A run of [A-Za-z0-9_-] outside a double-quoted string is treated as a key
// when the nearest preceding non-whitespace character is '{' or ',' and the
// nearest following non-whitespace character is ':'. Everything else is
// copied unchanged. A backslash makes the following character literal.
//
// Returns NULL if `src` is NULL or allocation fails. The caller owns the
// returned buffer and must free() it.
static char *quote_unquoted_keys(const char *src) {
    if (!src) return NULL;

    size_t len = strlen(src);
    // Each quoted key adds two bytes and needs at least one key byte plus a
    // ':' in the input, so 2 * len + 1 is a safe upper bound.
    char *result = malloc(len * 2 + 1);
    if (!result) return NULL;

    char *dst = result;
    const char *p = src;
    bool in_string = false;
    bool escaped = false;

    while (*p) {
        if (escaped) {
            *dst++ = *p++;
            escaped = false;
            continue;
        }
        if (*p == '\\') {
            *dst++ = *p++;
            escaped = true;
            continue;
        }
        if (*p == '"') {
            in_string = !in_string;
            *dst++ = *p++;
            continue;
        }

        if (!in_string && (isalnum((unsigned char)*p) || *p == '_' || *p == '-')) {
            // Find the nearest non-whitespace character before p. The scan is
            // done with an offset so no pointer before the start of `src` is
            // ever formed (that would be undefined behaviour).
            size_t back = (size_t)(p - src);
            char before = '\0';  // stays 0 when only whitespace (or nothing) precedes p
            while (back > 0) {
                char c = src[back - 1];
                if (!isspace((unsigned char)c)) {
                    before = c;
                    break;
                }
                back--;
            }

            if (before == '{' || before == ',') {
                const char *end = p;
                while (isalnum((unsigned char)*end) || *end == '_' || *end == '-') end++;

                const char *after = end;
                while (*after && isspace((unsigned char)*after)) after++;

                if (*after == ':') {
                    // It is an unquoted key: emit it wrapped in quotes.
                    *dst++ = '"';
                    while (p < end) *dst++ = *p++;
                    *dst++ = '"';
                    continue;
                }
            }
        }

        *dst++ = *p++;
    }

    *dst = '\0';
    return result;
}
|
|||
|
|
// Returns a newly allocated copy of `src` whose '{'/'[' and '}'/']' outside
// double-quoted strings are balanced.
//
//   - Openers are remembered on a stack sized from the input, so nesting depth
//     is not capped.
//   - A closer that matches the innermost opener pops it.
//   - A closer that matches a deeper opener first closes every opener above
//     it (e.g. `{"a": [1}` becomes `{"a": [1]}`).
//   - A closer with no matching opener anywhere on the stack is dropped.
//   - Openers still pending at end of input are closed in reverse order.
//
// A backslash makes the following character literal. Returns NULL if `src` is
// NULL or allocation fails. The caller owns the returned buffer and must
// free() it.
static char *balance_brackets(const char *src) {
    if (!src) return NULL;

    size_t len = strlen(src);

    // Every inserted closer pairs with a distinct opener taken from src, so
    // the output can never exceed 2 * len bytes plus the terminator.
    char *result = malloc(len * 2 + 1);
    if (!result) return NULL;

    // At most `len` openers can be pending at once (+1 avoids malloc(0)).
    char *stack = malloc(len + 1);
    if (!stack) {
        free(result);
        return NULL;
    }
    size_t top = 0;

    char *dst = result;
    const char *p = src;
    bool in_string = false;
    bool escaped = false;

    while (*p) {
        if (escaped) {
            *dst++ = *p++;
            escaped = false;
            continue;
        }
        if (*p == '\\') {
            *dst++ = *p++;
            escaped = true;
            continue;
        }
        if (*p == '"') {
            in_string = !in_string;
            *dst++ = *p++;
            continue;
        }

        if (!in_string) {
            if (*p == '{' || *p == '[') {
                stack[top++] = *p;
            } else if (*p == '}' || *p == ']') {
                char expected = (*p == '}') ? '{' : '[';

                // `match` ends as 1 + index of the nearest matching opener,
                // or 0 when the stack holds none.
                size_t match = top;
                while (match > 0 && stack[match - 1] != expected) match--;

                if (match == 0) {
                    // Stray closer with nothing to close: skip it.
                    p++;
                    continue;
                }

                // Close whatever was opened after the matching opener.
                while (top > match) {
                    *dst++ = (stack[--top] == '{') ? '}' : ']';
                }
                top--;  // pop the matching opener; the closer is copied below
            }
        }

        *dst++ = *p++;
    }

    // Close everything that is still open, innermost first.
    while (top > 0) {
        *dst++ = (stack[--top] == '{') ? '}' : ']';
    }

    *dst = '\0';
    free(stack);
    return result;
}
|
|||
|
|
// Builds a freshly allocated copy of `src` with all whitespace outside
// double-quoted strings removed. Whitespace inside strings is preserved, and
// a backslash makes the following character literal.
//
// Returns NULL when `src` is NULL or allocation fails; the caller frees the
// result.
static char *compact_json(const char *src) {
    if (src == NULL) return NULL;

    char *out = malloc(strlen(src) + 1);
    if (out == NULL) return NULL;

    char *w = out;
    bool quoted = false;
    bool pending_escape = false;

    for (const char *r = src; *r != '\0'; r++) {
        const char c = *r;

        if (pending_escape) {
            pending_escape = false;
        } else if (c == '\\') {
            pending_escape = true;
        } else if (c == '"') {
            quoted = !quoted;
        } else if (!quoted && isspace((unsigned char)c)) {
            continue;  // insignificant whitespace: do not copy
        }

        *w++ = c;
    }

    *w = '\0';
    return out;
}
|
|||
|
|
// Attempts to turn loosely formatted JSON-like text into compact JSON.
//
// Text before the first '{' or '[' is discarded. The remainder is passed
// through, in order: strip_comments, normalize_quotes, quote_unquoted_keys,
// remove_trailing_commas and balance_brackets. The result is then truncated
// after the first complete top-level object/array and finally stripped of
// insignificant whitespace by compact_json.
//
// If `src` contains neither '{' nor '[', an unmodified copy of `src` is
// returned. Returns NULL if `src` is NULL or any allocation fails. The caller
// owns the returned buffer and must free() it.
char *json_repair_string(const char *src) {
    if (!src) return NULL;

    // Find the first occurrence of { or [
    const char *start_ptr = src;
    while (*start_ptr && *start_ptr != '{' && *start_ptr != '[') start_ptr++;

    if (!*start_ptr) {
        // No JSON structure found, return a copy as is. Copied by hand rather
        // than with strdup(), which is not part of ISO C11.
        size_t size = strlen(src) + 1;
        char *copy = malloc(size);
        if (copy) memcpy(copy, src, size);
        return copy;
    }

    // Each pass returns NULL when given NULL, so an allocation failure in any
    // stage propagates down the chain and is caught once, below.
    char *s1 = strip_comments(start_ptr);
    char *s2 = normalize_quotes(s1);
    free(s1);
    char *s3 = quote_unquoted_keys(s2);
    free(s2);
    char *s4 = remove_trailing_commas(s3);
    free(s3);
    char *s5 = balance_brackets(s4);
    free(s4);

    // Without this guard the scan below would dereference a NULL pointer.
    if (!s5) return NULL;

    // Heuristic: truncate after the first complete object/array
    int depth = 0;
    bool in_str = false;
    bool esc = false;
    for (char *p = s5; *p; p++) {
        if (esc) {
            esc = false;
        } else if (*p == '\\') {
            esc = true;
        } else if (*p == '"') {
            in_str = !in_str;
        } else if (!in_str) {
            if (*p == '{' || *p == '[') {
                depth++;
            } else if (*p == '}' || *p == ']') {
                depth--;
                if (depth == 0) {
                    // p[1] is at worst the existing terminator, so the write
                    // stays inside the buffer.
                    p[1] = '\0';
                    break;
                }
            }
        }
    }

    char *s6 = compact_json(s5);
    free(s5);
    return s6;
}
|