r/json_repair.c at bd9b1b929e8c13b4b5ffd24a1df763bc4c3ce4bb

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

 // retoor <retoor@molodetz.nl>
 #include "json_repair.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdbool.h>
 #include <ctype.h>
 /*
  * Returns a newly allocated copy of src with comments removed, or NULL when
  * src is NULL or the allocation fails. The caller owns the result.
  *
  * Outside of double-quoted strings three comment forms are dropped:
  *   - "//" up to (not including) the next newline,
  *   - "#"  up to (not including) the next newline,
  *   - slash-star ... star-slash blocks (an unterminated block eats the rest).
  * A backslash always causes the following byte to be copied verbatim, so an
  * escaped quote does not toggle the string state.
  */
 static char *strip_comments(const char *src) {
     if (src == NULL) {
         return NULL;
     }
     char *out = malloc(strlen(src) + 1);
     if (out == NULL) {
         return NULL;
     }

     size_t w = 0;
     bool inside_quotes = false;
     bool after_backslash = false;
     const char *cur = src;

     while (*cur != '\0') {
         const char c = *cur;

         /* A backslash, or the byte right after one, is copied untouched;
          * the flag simply flips in both situations. */
         if (after_backslash || c == '\\') {
             after_backslash = !after_backslash;
             out[w++] = c;
             cur++;
             continue;
         }

         if (c == '"') {
             inside_quotes = !inside_quotes;
             out[w++] = c;
             cur++;
             continue;
         }

         if (!inside_quotes) {
             /* cur[1] is safe to read: cur[0] is not the terminator. */
             const bool line_comment = (c == '#') || (c == '/' && cur[1] == '/');
             if (line_comment) {
                 /* Stop on the newline itself so it is kept in the output. */
                 cur += strcspn(cur, "\n");
                 continue;
             }
             if (c == '/' && cur[1] == '*') {
                 const char *close = strstr(cur + 2, "*/");
                 cur = (close != NULL) ? close + 2 : cur + strlen(cur);
                 continue;
             }
         }

         out[w++] = c;
         cur++;
     }

     out[w] = '\0';
     return out;
 }
 /*
  * Classifies a UTF-8 encoded curly quote starting at p.
  * Returns '"' for U+201C / U+201D, '\'' for U+2018 / U+2019, and 0 when p
  * does not start one of those three-byte sequences. Reads stop at the first
  * mismatching byte, so the terminator is never passed.
  */
 static char curly_quote_at(const char *p) {
     const unsigned char *u = (const unsigned char *)p;
     if (u[0] != 0xE2 || u[1] != 0x80) return 0;
     if (u[2] == 0x9C || u[2] == 0x9D) return '"';
     if (u[2] == 0x98 || u[2] == 0x99) return '\'';
     return 0;
 }

 /*
  * Returns a newly allocated copy of src in which string delimiters are
  * normalized to ASCII double quotes, or NULL when src is NULL or the
  * allocation fails. The caller owns the result.
  *
  *   - Curly double quotes become '"' and open/close a string exactly like an
  *     ASCII '"' does, so the in-string state stays in sync with the output.
  *   - Outside a double-quoted string, '...' (ASCII or curly single quotes)
  *     is rewritten to "..."; a string must be closed by the same family of
  *     quote that opened it. Inside it, '"' is escaped, "\'" is reduced to a
  *     plain apostrophe (JSON has no \' escape), other escape pairs are
  *     copied, and a missing terminator is supplied at end of input.
  *   - Inside a double-quoted string a curly single quote becomes an ASCII
  *     apostrophe; everything else is copied.
  *   - A byte following a backslash is copied without interpretation (a curly
  *     quote there is replaced by its ASCII equivalent).
  */
 static char *normalize_quotes(const char *src) {
     if (!src) return NULL;
     size_t len = strlen(src);
     // No input byte ever expands to more than two output bytes (worst case:
     // '"' -> '\\' '"'), and the one possible synthesized closing quote is
     // paid for by its opening quote, so 2 * len + 1 is always enough.
     char *result = malloc(len * 2 + 1);
     if (!result) return NULL;
     char *dst = result;
     const char *p = src;
     bool in_double_string = false;
     bool escaped = false;
     while (*p) {
         char curly = curly_quote_at(p);
         size_t quote_len = curly ? 3 : 1;
         if (escaped) {
             if (curly) {
                 *dst++ = curly;
                 p += 3;
             } else {
                 *dst++ = *p++;
             }
             escaped = false;
             continue;
         }
         if (*p == '\\') {
             *dst++ = *p++;
             escaped = true;
             continue;
         }
         if (*p == '"' || curly == '"') {
             in_double_string = !in_double_string;
             *dst++ = '"';
             p += quote_len;
             continue;
         }
         if (in_double_string) {
             if (curly == '\'') {
                 *dst++ = '\'';
                 p += 3;
             } else {
                 *dst++ = *p++;
             }
             continue;
         }
         if (*p == '\'' || curly == '\'') {
             // Heuristic: convert '...' to "..."
             bool curly_delims = (curly != 0);
             *dst++ = '"';
             p += quote_len;
             while (*p) {
                 bool closing = curly_delims ? (curly_quote_at(p) == '\'')
                                             : (*p == '\'');
                 if (closing) break;
                 if (*p == '\\' && *(p + 1)) {
                     if (*(p + 1) == '\'') {
                         // \' is not a valid JSON escape; keep the apostrophe only
                         *dst++ = '\'';
                         p += 2;
                     } else {
                         *dst++ = *p++;
                         *dst++ = *p++;
                     }
                 } else if (*p == '"') {
                     *dst++ = '\\';
                     *dst++ = '"';
                     p++;
                 } else {
                     *dst++ = *p++;
                 }
             }
             if (*p) p += curly_delims ? 3 : 1;
             // Always close the string, even when the input ended early
             *dst++ = '"';
             continue;
         }
         *dst++ = *p++;
     }
     *dst = '\0';
     return result;
 }
 /*
  * Returns a newly allocated copy of src without commas that directly precede
  * a closing ']' or '}' (ignoring whitespace in between), or NULL when src is
  * NULL or the allocation fails. The caller owns the result.
  *
  * When such a comma is dropped, the whitespace between it and the closing
  * bracket is dropped as well. Commas inside double-quoted strings are left
  * alone, and a byte following a backslash never changes the string state.
  */
 static char *remove_trailing_commas(const char *src) {
     if (src == NULL) {
         return NULL;
     }
     char *out = malloc(strlen(src) + 1);
     if (out == NULL) {
         return NULL;
     }

     size_t n = 0;
     bool quoted = false;
     bool pending_escape = false;
     const char *cur = src;

     while (*cur != '\0') {
         const char c = *cur;

         if (pending_escape) {
             pending_escape = false;
         } else if (c == '\\') {
             pending_escape = true;
         } else if (c == '"') {
             quoted = !quoted;
         } else if (c == ',' && !quoted) {
             /* Peek past whitespace; isspace('\0') is false, so the scan
              * cannot run off the end of the string. */
             const char *look = cur + 1;
             while (isspace((unsigned char)*look)) {
                 look++;
             }
             if (*look == ']' || *look == '}') {
                 cur = look; /* resume at the bracket, comma is discarded */
                 continue;
             }
         }

         out[n++] = c;
         cur++;
     }

     out[n] = '\0';
     return out;
 }
 /* True for bytes allowed in a bare (unquoted) key: letters, digits, '_' and '-'. */
 static bool jr_is_bare_key_char(char c) {
     return isalnum((unsigned char)c) || c == '_' || c == '-';
 }

 /*
  * Returns a newly allocated copy of src in which bare object keys are wrapped
  * in double quotes, or NULL when src is NULL or the allocation fails. The
  * caller owns the result.
  *
  * A run of key characters outside any double-quoted string is treated as a
  * key when the nearest non-whitespace byte before it is '{' or ',' and the
  * nearest non-whitespace byte after it is ':'. Everything else is copied
  * unchanged.
  */
 static char *quote_unquoted_keys(const char *src) {
     if (!src) return NULL;
     size_t len = strlen(src);
     // Each quoted key adds two bytes and needs at least three input bytes
     // (delimiter, key, colon), so doubling the length is more than enough.
     char *result = malloc(len * 2 + 1);
     if (!result) return NULL;
     char *dst = result;
     const char *p = src;
     bool in_string = false;
     bool escaped = false;
     while (*p) {
         if (escaped) {
             *dst++ = *p++;
             escaped = false;
             continue;
         }
         if (*p == '\\') {
             *dst++ = *p++;
             escaped = true;
             continue;
         }
         if (*p == '"') {
             in_string = !in_string;
             *dst++ = *p++;
             continue;
         }
         if (!in_string && jr_is_bare_key_char(*p)) {
             // Look backwards for '{' or ',' using an offset rather than a
             // pointer: forming src - 1 would be undefined behaviour when the
             // identifier sits at the very start of the buffer.
             size_t before = (size_t)(p - src);
             while (before > 0 && isspace((unsigned char)src[before - 1])) before--;
             if (before > 0 && (src[before - 1] == '{' || src[before - 1] == ',')) {
                 const char *end = p;
                 while (*end && jr_is_bare_key_char(*end)) end++;
                 const char *after = end;
                 while (*after && isspace((unsigned char)*after)) after++;
                 if (*after == ':') {
                     // It is an unquoted key: emit it wrapped in quotes
                     *dst++ = '"';
                     while (p < end) *dst++ = *p++;
                     *dst++ = '"';
                     continue;
                 }
             }
         }
         *dst++ = *p++;
     }
     *dst = '\0';
     return result;
 }
 /*
  * Returns a newly allocated copy of src whose '{' '}' '[' ']' outside of
  * double-quoted strings are balanced, or NULL when src is NULL or an
  * allocation fails. The caller owns the result.
  *
  *   - A closer with no matching opener anywhere on the stack is dropped.
  *   - A closer whose opener is deeper in the stack first gets the missing
  *     closers for the openers above it, e.g. {"a": [1} becomes {"a": [1]}.
  *   - Openers still pending at end of input are closed innermost-first.
  *
  * Brackets inside strings are ignored, and a byte following a backslash is
  * copied without affecting the string state.
  */
 static char *balance_brackets(const char *src) {
     if (!src) return NULL;
     size_t len = strlen(src);
     if (len > ((size_t)-1 - 1) / 2) return NULL;
     // Every opener receives at most one synthesized closer, so the output is
     // never longer than 2 * len; the stack can hold every byte as an opener.
     char *result = malloc(len * 2 + 1);
     char *stack = malloc(len + 1);
     if (!result || !stack) {
         free(result);
         free(stack);
         return NULL;
     }
     size_t top = 0;
     size_t open_braces = 0;   // number of '{' currently on the stack
     size_t open_squares = 0;  // number of '[' currently on the stack
     char *dst = result;
     const char *p = src;
     bool in_string = false;
     bool escaped = false;
     while (*p) {
         if (escaped) {
             *dst++ = *p++;
             escaped = false;
             continue;
         }
         if (*p == '\\') {
             *dst++ = *p++;
             escaped = true;
             continue;
         }
         if (*p == '"') {
             in_string = !in_string;
             *dst++ = *p++;
             continue;
         }
         if (!in_string) {
             if (*p == '{' || *p == '[') {
                 stack[top++] = *p;
                 if (*p == '{') open_braces++;
                 else open_squares++;
             } else if (*p == '}' || *p == ']') {
                 bool is_brace = (*p == '}');
                 size_t *live = is_brace ? &open_braces : &open_squares;
                 if (*live == 0) {
                     // No opener of this kind is pending; skip the stray closer
                     p++;
                     continue;
                 }
                 char expected = is_brace ? '{' : '[';
                 // Close whatever was left open above the matching opener.
                 // The counter guarantees the opener exists, so top stays > 0.
                 while (stack[top - 1] != expected) {
                     char opener = stack[--top];
                     if (opener == '{') {
                         open_braces--;
                         *dst++ = '}';
                     } else {
                         open_squares--;
                         *dst++ = ']';
                     }
                 }
                 top--;
                 (*live)--;
             }
         }
         *dst++ = *p++;
     }
     while (top > 0) {
         char opener = stack[--top];
         *dst++ = (opener == '{') ? '}' : ']';
     }
     *dst = '\0';
     free(stack);
     return result;
 }
 /*
  * Returns a newly allocated copy of src with all whitespace outside of
  * double-quoted strings removed, or NULL when src is NULL or the allocation
  * fails. The caller owns the result.
  *
  * A backslash and the byte after it are always kept, and that following
  * byte never toggles the string state.
  */
 static char *compact_json(const char *src) {
     if (src == NULL) {
         return NULL;
     }
     char *packed = malloc(strlen(src) + 1);
     if (packed == NULL) {
         return NULL;
     }

     char *wr = packed;
     bool in_text = false;
     bool skip_next = false;

     for (const char *rd = src; *rd != '\0'; rd++) {
         const char ch = *rd;
         bool keep = true;

         if (skip_next) {
             skip_next = false;
         } else if (ch == '\\') {
             skip_next = true;
         } else if (ch == '"') {
             in_text = !in_text;
         } else if (!in_text && isspace((unsigned char)ch)) {
             keep = false;
         }

         if (keep) {
             *wr++ = ch;
         }
     }

     *wr = '\0';
     return packed;
 }
 /*
  * Attempts to turn loosely formatted JSON text into compact JSON.
  *
  * Everything before the first '{' or '[' is discarded; the remainder is run
  * through comment stripping, quote normalization, key quoting, trailing-comma
  * removal and bracket balancing, truncated after the first complete top-level
  * object/array, and finally stripped of insignificant whitespace.
  *
  * Returns a newly allocated string that the caller must free. When src holds
  * no '{' or '[' at all, an unmodified copy of src is returned. Returns NULL
  * when src is NULL or any allocation fails.
  */
 char *json_repair_string(const char *src) {
     if (!src) return NULL;
     // Find the first occurrence of { or [
     const char *start_ptr = src;
     while (*start_ptr && *start_ptr != '{' && *start_ptr != '[') start_ptr++;
     if (!*start_ptr) {
         // No JSON structure found, return a copy as is. malloc + memcpy is
         // used instead of strdup, which is not part of ISO C before C23.
         size_t size = strlen(src) + 1;
         char *copy = malloc(size);
         if (copy) memcpy(copy, src, size);
         return copy;
     }
     // Each stage returns NULL for a NULL input, so a failure anywhere in the
     // chain propagates to s5 and is checked once below.
     char *s1 = strip_comments(start_ptr);
     char *s2 = normalize_quotes(s1);
     free(s1);
     char *s3 = quote_unquoted_keys(s2);
     free(s2);
     char *s4 = remove_trailing_commas(s3);
     free(s3);
     char *s5 = balance_brackets(s4);
     free(s4);
     if (!s5) return NULL;
     // Heuristic: truncate after the first complete object/array
     int depth = 0;
     bool in_str = false;
     bool esc = false;
     char *p = s5;
     while (*p) {
         if (esc) { esc = false; }
         else if (*p == '\\') { esc = true; }
         else if (*p == '"') { in_str = !in_str; }
         else if (!in_str) {
             if (*p == '{' || *p == '[') depth++;
             else if (*p == '}' || *p == ']') {
                 depth--;
                 if (depth == 0) {
                     // p + 1 is at worst the existing terminator, so this
                     // write stays inside the buffer.
                     *(p + 1) = '\0';
                     break;
                 }
             }
         }
         p++;
     }
     char *s6 = compact_json(s5);
     free(s5);
     return s6;
 }