1
1
import base64
2
2
import enum
3
3
import importlib .resources
4
+ import json
4
5
import logging
6
+ import re
5
7
from typing import Any , Dict , Type
6
8
7
- from pydantic import BaseModel
9
+ from pydantic import BaseModel , ValidationError
8
10
11
+ from index .agent .models import AgentLLMOutput
9
12
from index .browser .utils import scale_b64_image
13
+ from index .llm .llm import BaseLLMProvider , Message
10
14
11
15
logger = logging .getLogger (__name__ )
12
16
@@ -91,4 +95,127 @@ def process_model(model):
91
95
92
96
return model_schema
93
97
94
- return process_model (model_class )
98
+ return process_model (model_class )
99
+
100
+
101
async def generate_proper_json(llm: BaseLLMProvider, json_str: str) -> str:
    """Ask the LLM to repair a malformed JSON string.

    Sends a single user message containing the problematic string and returns
    the model's reply with surrounding whitespace and any wrapping markdown
    code fence removed. The reply is NOT parsed or validated here; callers
    must still check that it is valid JSON.

    Args:
        llm: Provider whose ``call`` coroutine is awaited with the message list.
        json_str: The malformed JSON text to be corrected.

    Returns:
        The model's reply, stripped of whitespace and an enclosing code fence.
    """
    prompt = f"""The following JSON string is malformed or has issues. Please correct it while preserving the original structure and content as much as possible.
Return ONLY the corrected JSON string, without any surrounding text, comments, or markdown. Do not add any explanations.

Problematic JSON string:
{json_str}
"""

    response = await llm.call([Message(role="user", content=prompt)])
    corrected_json_str = response.content.strip()

    # Models frequently wrap the answer in a fenced block despite the prompt.
    # Accept any opening fence (bare ``` as well as ```json / ```JSON): if only
    # "```json" were recognised, a bare fence would lose its closing half below
    # while keeping the opening one, leaving a string that can never parse.
    if corrected_json_str.startswith("```"):
        corrected_json_str = corrected_json_str[3:]
        if corrected_json_str[:4].lower() == "json":
            corrected_json_str = corrected_json_str[4:]
    if corrected_json_str.endswith("```"):
        corrected_json_str = corrected_json_str[:-3]
    return corrected_json_str.strip()
121
+
122
+
123
# Control characters that are stripped before the second parse attempt.
# Tab (\x09), line feed (\x0A) and carriage return (\x0D) are deliberately kept.
_JSON_CONTROL_CHARS_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F]')


def _strip_markdown_fence(text: str) -> str:
    """Remove one enclosing markdown code fence (if any) and trim whitespace."""
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
        # Optional language tag directly after the opening fence.
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _extract_json_candidate(raw_llm_response_content: str) -> str:
    """Pull the text between <output...> tags, or fall back to de-tagging the whole response."""
    match = re.search(
        r"<output(?:[^>]*)>(.*?)</output(?:[^>]*)>",
        raw_llm_response_content,
        re.DOTALL,
    )
    if match:
        return match.group(1).strip()

    # No complete opening/closing pair was found. Drop any stray opening or
    # closing tag and any markdown fence markers, and use whatever remains.
    without_closing = re.sub(r"</output(?:[^>]*)>", "", raw_llm_response_content).strip()
    without_tags = re.sub(r"<output(?:[^>]*)>", "", without_closing).strip()
    return without_tags.replace("```json", "").replace("```", "").strip()


async def validate_json(raw_llm_response_content: str, llm: BaseLLMProvider, max_retries: int = 3) -> AgentLLMOutput:
    """
    Extracts, validates, and parses a JSON string from raw LLM output,
    attempting to fix it if necessary using retries with cleaning and LLM-based correction.

    Each attempt has up to two parsing stages: the candidate string as-is
    (minus an enclosing markdown fence), then the same string with control
    characters removed. The second stage is skipped when cleaning changes
    nothing, because it would fail identically. After a failed attempt that is
    not the last one, the candidate is sent to ``generate_proper_json`` and the
    reply becomes the candidate for the next attempt.

    Args:
        raw_llm_response_content: The raw string content from the LLM response.
        llm: The LLM provider instance for fixing JSON if needed.
        max_retries: Maximum number of attempts to parse the JSON.

    Returns:
        An AgentLLMOutput object.

    Raises:
        ValueError: If the JSON string cannot be parsed or validated after all retries.
            The last parsing/validation error is attached as the cause.
    """
    current_json_str = _extract_json_candidate(raw_llm_response_content)
    last_exception = None

    for attempt in range(max_retries):
        attempt_no = attempt + 1
        logger.debug(f"JSON parsing attempt {attempt_no}/{max_retries}")

        # Stage 1: parse the candidate as-is (only an enclosing fence removed).
        direct_candidate = _strip_markdown_fence(current_json_str)
        try:
            logger.debug(f"Attempting to parse JSON on attempt {attempt_no}. Raw JSON: '{direct_candidate}'")
            output = AgentLLMOutput.model_validate_json(direct_candidate)
            logger.debug(f"Successfully parsed JSON on attempt {attempt_no}.")
            return output
        except (json.JSONDecodeError, ValidationError) as e1:
            logger.warning(f"Direct JSON parsing failed on attempt {attempt_no}: {e1}")
            last_exception = e1

        # Stage 2: parse again after removing control characters. Only worth
        # doing when the cleaning actually produced a different string.
        cleaned_candidate = _strip_markdown_fence(_JSON_CONTROL_CHARS_RE.sub('', current_json_str))
        if cleaned_candidate != direct_candidate:
            try:
                logger.debug(
                    f"Attempting to parse cleaned JSON on attempt {attempt_no}. "
                    f"Cleaned JSON: '{cleaned_candidate[:250]}...'"
                )
                output = AgentLLMOutput.model_validate_json(cleaned_candidate)
                logger.debug(f"Successfully parsed JSON on attempt {attempt_no} (after cleaning).")
                return output
            except (json.JSONDecodeError, ValidationError) as e2:
                logger.warning(f"Cleaned JSON parsing failed on attempt {attempt_no}: {e2}")
                last_exception = e2

        if attempt < max_retries - 1:
            logger.debug(f"Attempt {attempt_no} failed. Attempting to fix JSON with LLM.")
            try:
                # The un-cleaned candidate is sent so the model sees the original text.
                current_json_str = await generate_proper_json(llm, current_json_str)
                logger.debug(f"LLM proposed a new JSON string: '{current_json_str}'")
            except Exception as llm_fix_exception:
                # Best effort: a failed repair call must not abort the retry
                # loop. The previous candidate is kept, and the next attempt
                # will re-parse it and try the repair call again.
                logger.error(f"LLM call to fix JSON failed during attempt {attempt_no}: {llm_fix_exception}")
        else:
            logger.error(
                f"All {max_retries} attempts to parse JSON failed. "
                f"Final attempt was with: '{current_json_str[:250]}...'"
            )

    # A real newline separates the two parts of the message (the previous
    # escaped form printed a literal backslash followed by "n").
    raise ValueError(
        f"Could not parse or validate response after {max_retries} attempts. "
        f"Last error: {last_exception}\n"
        f"Final problematic JSON string after all attempts: '{current_json_str[:500]}...'"
    ) from last_exception
221
+
0 commit comments