// retoor <retoor@molodetz.nl>
# include "json_repair.h"
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# include <stdbool.h>
# include <ctype.h>
// Returns a newly allocated copy of src with comments removed.
//
// Recognised comment forms (only outside string literals):
//   - "//" up to, but not including, the next newline
//   - "/* ... */" (an unterminated block comment runs to end of input)
//   - "#" up to, but not including, the next newline
//
// Both double-quoted and single-quoted strings are honoured, so comment
// markers inside either kind of string (e.g. 'http://host/#frag') are kept.
// A string is closed only by the same quote character that opened it.
// A backslash always causes the following byte to be copied verbatim.
//
// Returns NULL if src is NULL or allocation fails. The caller owns (and must
// free) the returned buffer.
static char *strip_comments(const char *src)
{
    if (!src) {
        return NULL;
    }

    // The output can only shrink, so strlen(src) + 1 bytes always suffice.
    char *result = malloc(strlen(src) + 1);
    if (!result) {
        return NULL;
    }

    char *dst = result;
    const char *p = src;
    char quote = '\0';     // delimiter of the string we are inside, or 0
    bool escaped = false;  // previous byte was an unescaped backslash

    while (*p) {
        if (escaped) {
            *dst++ = *p++;
            escaped = false;
            continue;
        }
        if (*p == '\\') {
            *dst++ = *p++;
            escaped = true;
            continue;
        }
        if (quote) {
            // Inside a string: copy everything, leave on the matching quote.
            if (*p == quote) {
                quote = '\0';
            }
            *dst++ = *p++;
            continue;
        }
        if (*p == '"' || *p == '\'') {
            quote = *p;
            *dst++ = *p++;
            continue;
        }
        if (p[0] == '/' && p[1] == '/') {
            // Line comment: the newline itself is kept.
            while (*p && *p != '\n') {
                p++;
            }
            continue;
        }
        if (p[0] == '/' && p[1] == '*') {
            p += 2;
            while (*p && !(p[0] == '*' && p[1] == '/')) {
                p++;
            }
            if (*p) {
                p += 2;  // skip the closing "*/"
            }
            continue;
        }
        if (*p == '#') {
            while (*p && *p != '\n') {
                p++;
            }
            continue;
        }
        *dst++ = *p++;
    }

    *dst = '\0';
    return result;
}
// Classifies the UTF-8 curly quote that starts at p, if any.
// Returns '"' for U+201C / U+201D, '\'' for U+2018 / U+2019, and 0 otherwise.
// Each byte is inspected only after the previous one matched a non-zero
// value, so the read never goes past the terminating NUL.
static char smart_quote_kind(const char *p)
{
    const unsigned char *u = (const unsigned char *)p;

    if (u[0] != 0xE2 || u[1] != 0x80) {
        return 0;
    }
    if (u[2] == 0x9C || u[2] == 0x9D) {
        return '"';
    }
    if (u[2] == 0x98 || u[2] == 0x99) {
        return '\'';
    }
    return 0;
}

// Returns a newly allocated copy of src in which string delimiters are
// normalised to ASCII double quotes:
//   - curly double quotes act exactly like '"' (they open/close a string);
//   - outside a double-quoted string, a single quote (ASCII or curly) opens a
//     single-quoted string that is rewritten as "...", with embedded '"'
//     escaped as \". An ASCII-opened string closes on the next ASCII "'"; a
//     curly-opened string closes on the next curly single quote;
//   - inside a double-quoted string, a curly single quote becomes "'".
// A backslash causes the following character to be emitted without being
// interpreted as a delimiter.
//
// Returns NULL if src is NULL or allocation fails. The caller owns the result.
static char *normalize_quotes(const char *src)
{
    if (!src) {
        return NULL;
    }

    size_t len = strlen(src);
    // Worst case growth is 2x: every '"' inside a single-quoted string turns
    // into the two bytes \". All other rewrites keep or shrink the size.
    char *result = malloc(len * 2 + 1);
    if (!result) {
        return NULL;
    }

    char *dst = result;
    const char *p = src;
    bool in_double_string = false;
    bool escaped = false;

    while (*p) {
        char smart = smart_quote_kind(p);
        char c = smart ? smart : *p;   // ASCII equivalent of what is at p
        size_t width = smart ? 3 : 1;  // bytes the current character occupies

        if (escaped) {
            *dst++ = c;
            p += width;
            escaped = false;
            continue;
        }
        if (c == '\\') {
            *dst++ = *p++;
            escaped = true;
            continue;
        }
        if (c == '"') {
            in_double_string = !in_double_string;
            *dst++ = '"';
            p += width;
            continue;
        }
        if (c == '\'' && !in_double_string) {
            // Heuristic: convert '...' to "..."
            bool curly = (smart != 0);

            *dst++ = '"';
            p += width;
            while (*p) {
                bool closes = curly ? (smart_quote_kind(p) == '\'')
                                    : (*p == '\'');
                if (closes) {
                    break;
                }
                if (*p == '\\' && p[1]) {
                    // Keep escape sequences intact (including \').
                    *dst++ = *p++;
                    *dst++ = *p++;
                } else if (*p == '"') {
                    *dst++ = '\\';
                    *dst++ = '"';
                    p++;
                } else {
                    *dst++ = *p++;
                }
            }
            if (*p) {
                // Stopped on the closing quote (not on end of input).
                *dst++ = '"';
                p += curly ? 3 : 1;
            }
            continue;
        }
        *dst++ = c;
        p += width;
    }

    *dst = '\0';
    return result;
}
// Produces a freshly allocated copy of src in which every comma that is
// followed (ignoring whitespace) by ']' or '}' has been dropped, together
// with the whitespace between the comma and that closer. Commas inside
// double-quoted strings are left alone, and a backslash makes the next byte
// opaque to the scanner.
//
// Yields NULL when src is NULL or memory cannot be obtained; the caller frees
// the result.
static char *remove_trailing_commas(const char *src)
{
    if (src == NULL) {
        return NULL;
    }

    char *out = malloc(strlen(src) + 1);
    if (out == NULL) {
        return NULL;
    }

    size_t w = 0;            // write index into out
    size_t r = 0;            // read index into src
    bool quoted = false;     // currently between double quotes
    bool take_next = false;  // previous byte was a backslash

    while (src[r] != '\0') {
        const char c = src[r];

        if (take_next) {
            take_next = false;
        } else if (c == '\\') {
            take_next = true;
        } else if (c == '"') {
            quoted = !quoted;
        } else if (c == ',' && !quoted) {
            // Look past whitespace for a closing bracket or brace.
            size_t look = r + 1;
            while (src[look] != '\0' && isspace((unsigned char)src[look])) {
                look++;
            }
            if (src[look] == ']' || src[look] == '}') {
                r = look;  // resume at the closer, dropping the comma
                continue;
            }
        }

        out[w++] = c;
        r++;
    }

    out[w] = '\0';
    return out;
}
// Returns a newly allocated copy of src in which bare object keys are wrapped
// in double quotes, e.g. {a: 1} becomes {"a": 1}.
//
// A bare key is a run of [A-Za-z0-9_-] found outside a double-quoted string
// whose preceding non-whitespace character is '{' or ',' and whose following
// non-whitespace character is ':'. Everything else is copied unchanged.
//
// Returns NULL if src is NULL or allocation fails. The caller owns the result.
static char *quote_unquoted_keys(const char *src)
{
    if (!src) {
        return NULL;
    }

    size_t len = strlen(src);
    // Each quoted key adds two bytes and needs at least a delimiter, one key
    // character and a colon in the input, so 2 * len is a safe upper bound.
    char *result = malloc(len * 2 + 1);
    if (!result) {
        return NULL;
    }

    char *dst = result;
    const char *p = src;
    bool in_string = false;
    bool escaped = false;

    while (*p) {
        if (escaped) {
            *dst++ = *p++;
            escaped = false;
            continue;
        }
        if (*p == '\\') {
            *dst++ = *p++;
            escaped = true;
            continue;
        }
        if (*p == '"') {
            in_string = !in_string;
            *dst++ = *p++;
            continue;
        }
        if (!in_string &&
            (isalnum((unsigned char)*p) || *p == '_' || *p == '-')) {
            // Walk backwards over whitespace using an index rather than a
            // pointer, so we never form a pointer before the start of src.
            size_t before = (size_t)(p - src);
            while (before > 0 && isspace((unsigned char)src[before - 1])) {
                before--;
            }
            if (before > 0 &&
                (src[before - 1] == '{' || src[before - 1] == ',')) {
                const char *end = p;
                while (*end && (isalnum((unsigned char)*end) ||
                                *end == '_' || *end == '-')) {
                    end++;
                }
                const char *after = end;
                while (*after && isspace((unsigned char)*after)) {
                    after++;
                }
                if (*after == ':') {
                    // It is an unquoted key: emit it wrapped in quotes.
                    *dst++ = '"';
                    while (p < end) {
                        *dst++ = *p++;
                    }
                    *dst++ = '"';
                    continue;
                }
            }
        }
        *dst++ = *p++;
    }

    *dst = '\0';
    return result;
}
// Returns a newly allocated copy of src whose '{' / '[' and '}' / ']' outside
// double-quoted strings are balanced:
//   - a closer with no matching opener anywhere on the stack is dropped;
//   - a closer whose matching opener is buried under other still-open
//     containers first closes those inner containers, then itself;
//   - containers still open at end of input are closed in reverse order.
// Nesting depth is limited only by available memory.
//
// Returns NULL if src is NULL or allocation fails. The caller owns the result.
static char *balance_brackets(const char *src)
{
    if (!src) {
        return NULL;
    }

    size_t len = strlen(src);
    // Every inserted closer corresponds to one opener popped from the stack,
    // and each opener is pushed once, so at most len bytes are ever inserted.
    char *result = malloc(len * 2 + 1);
    char *stack = malloc(len + 1);
    if (!result || !stack) {
        free(result);
        free(stack);
        return NULL;
    }

    size_t top = 0;
    char *dst = result;
    bool in_string = false;
    bool escaped = false;

    for (const char *p = src; *p; p++) {
        char c = *p;

        if (escaped) {
            escaped = false;
        } else if (c == '\\') {
            escaped = true;
        } else if (c == '"') {
            in_string = !in_string;
        } else if (!in_string) {
            if (c == '{' || c == '[') {
                stack[top++] = c;
            } else if (c == '}' || c == ']') {
                char opener = (c == '}') ? '{' : '[';

                // match is one past the index of the nearest matching opener,
                // or 0 when the stack holds none.
                size_t match = top;
                while (match > 0 && stack[match - 1] != opener) {
                    match--;
                }
                if (match == 0) {
                    continue;  // stray closer: skip it
                }
                // Close inner containers that were left open.
                while (top > match) {
                    char inner = stack[--top];
                    *dst++ = (inner == '{') ? '}' : ']';
                }
                top--;  // pop the opener this closer matches
            }
        }
        *dst++ = c;
    }

    while (top > 0) {
        char opener = stack[--top];
        *dst++ = (opener == '{') ? '}' : ']';
    }
    *dst = '\0';

    free(stack);
    return result;
}
// Builds a freshly allocated copy of src with all whitespace that lies
// outside double-quoted strings removed. Bytes inside strings, and any byte
// that directly follows a backslash, are copied untouched.
//
// Yields NULL when src is NULL or memory cannot be obtained; the caller frees
// the result.
static char *compact_json(const char *src)
{
    if (src == NULL) {
        return NULL;
    }

    char *out = malloc(strlen(src) + 1);
    if (out == NULL) {
        return NULL;
    }

    size_t w = 0;            // write index into out
    bool quoted = false;     // currently between double quotes
    bool take_next = false;  // previous byte was a backslash

    for (size_t r = 0; src[r] != '\0'; r++) {
        const char c = src[r];

        if (take_next) {
            take_next = false;
        } else if (c == '\\') {
            take_next = true;
        } else if (c == '"') {
            quoted = !quoted;
        } else if (!quoted && isspace((unsigned char)c)) {
            continue;  // drop insignificant whitespace
        }

        out[w++] = c;
    }

    out[w] = '\0';
    return out;
}
// Attempts to turn loosely formatted JSON-like text into compact JSON.
//
// Processing starts at the first '{' or '[' in src; if there is none, a plain
// copy of src is returned. From there the text is passed through, in order:
// strip_comments, normalize_quotes, quote_unquoted_keys,
// remove_trailing_commas and balance_brackets. The result is then cut off
// right after the first top-level object/array closes, and finally passed
// through compact_json.
//
// Returns a newly allocated string owned by the caller, or NULL if src is
// NULL or any stage fails to allocate.
char *json_repair_string(const char *src)
{
    if (!src) {
        return NULL;
    }

    // Find the first occurrence of { or [
    const char *start_ptr = src + strcspn(src, "{[");
    if (!*start_ptr) {
        return strdup(src);  // No JSON structure found, return as is
    }

    // Each stage returns NULL for NULL input, so a failure anywhere simply
    // propagates down the chain; free(NULL) is a no-op.
    char *stripped = strip_comments(start_ptr);
    char *quoted = normalize_quotes(stripped);
    free(stripped);
    char *keyed = quote_unquoted_keys(quoted);
    free(quoted);
    char *trimmed = remove_trailing_commas(keyed);
    free(keyed);
    char *balanced = balance_brackets(trimmed);
    free(trimmed);
    if (!balanced) {
        return NULL;  // must not scan a NULL buffer below
    }

    // Heuristic: truncate after the first complete object/array
    int depth = 0;
    bool in_str = false;
    bool esc = false;
    for (char *p = balanced; *p; p++) {
        if (esc) {
            esc = false;
        } else if (*p == '\\') {
            esc = true;
        } else if (*p == '"') {
            in_str = !in_str;
        } else if (!in_str) {
            if (*p == '{' || *p == '[') {
                depth++;
            } else if (*p == '}' || *p == ']') {
                depth--;
                if (depth == 0) {
                    // p[1] is at worst the existing terminator, so this
                    // write stays inside the buffer.
                    p[1] = '\0';
                    break;
                }
            }
        }
    }

    char *compact = compact_json(balanced);
    free(balanced);
    return compact;
}