fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
486 lines
18 KiB
Python
486 lines
18 KiB
Python
from typing import TYPE_CHECKING, Any, cast
|
|
|
|
from .parser_schema import ObjectSchemaConfig, resolve_parser_object_schema
|
|
from .utils.constants import MISSING_VALUE, STRING_DELIMITERS, JSONReturnType
|
|
from .utils.json_context import ContextValues
|
|
from .utils.pattern_properties import match_pattern_properties
|
|
|
|
if TYPE_CHECKING:
|
|
from .json_parser import JSONParser
|
|
from .schema_repair import SchemaRepairer
|
|
|
|
|
|
def _finalize_object(
|
|
obj: dict[str, JSONReturnType],
|
|
schema_repairer: "SchemaRepairer | None",
|
|
schema_config: ObjectSchemaConfig | None,
|
|
path: str,
|
|
) -> dict[str, JSONReturnType]:
|
|
if schema_repairer is None or schema_config is None:
|
|
return obj
|
|
|
|
missing_required = [key for key in schema_config.required if key not in obj]
|
|
if missing_required and schema_repairer.schema_repair_mode != "salvage":
|
|
raise ValueError(f"Missing required properties at {path}: {', '.join(missing_required)}")
|
|
|
|
for key, prop_schema in schema_config.properties.items():
|
|
if key in obj or key in schema_config.required:
|
|
continue
|
|
if isinstance(prop_schema, dict) and "default" in prop_schema:
|
|
obj[key] = schema_repairer._copy_json_value(prop_schema["default"], f"{path}.{key}", "default")
|
|
schema_repairer._log("Inserted default value for missing property", f"{path}.{key}")
|
|
return obj
|
|
|
|
|
|
def _strip_comments_for_empty_object_classification(body: str) -> str:
|
|
stripped = []
|
|
in_quote: str | None = None
|
|
backslashes = 0
|
|
index = 0
|
|
while index < len(body):
|
|
char = body[index]
|
|
next_char = body[index + 1] if index + 1 < len(body) else ""
|
|
|
|
if char == "\\":
|
|
backslashes += 1
|
|
stripped.append(char)
|
|
index += 1
|
|
continue
|
|
if in_quote is not None:
|
|
stripped.append(char)
|
|
if char == in_quote and backslashes % 2 == 0:
|
|
in_quote = None
|
|
backslashes = 0
|
|
index += 1
|
|
continue
|
|
if char in STRING_DELIMITERS and backslashes % 2 == 0:
|
|
in_quote = char
|
|
stripped.append(char)
|
|
backslashes = 0
|
|
index += 1
|
|
continue
|
|
backslashes = 0
|
|
|
|
if char == "#" or (char == "/" and next_char == "/"):
|
|
index += 2 if char == "/" else 1
|
|
while index < len(body) and body[index] not in ["\n", "\r"]:
|
|
index += 1
|
|
continue
|
|
if char == "/" and next_char == "*":
|
|
index += 2
|
|
while index < len(body) - 1 and body[index : index + 2] != "*/":
|
|
index += 1
|
|
index = min(index + 2, len(body))
|
|
continue
|
|
|
|
stripped.append(char)
|
|
index += 1
|
|
|
|
return "".join(stripped)
|
|
|
|
|
|
def _classify_empty_object_repair(
|
|
self: "JSONParser",
|
|
start_index: int,
|
|
schema: dict[str, Any] | bool | None,
|
|
schema_repairer: "SchemaRepairer | None",
|
|
) -> tuple[str, str | None]:
|
|
attempted_object = self.json_str[start_index - 1 : self.index + 1]
|
|
body = attempted_object[1:]
|
|
body = body.removesuffix("}")
|
|
body = body.lstrip()
|
|
if not body:
|
|
return "keep", None
|
|
if (body.startswith('\\"') and '\\":' in body) or (body.startswith("\\'") and "\\':" in body):
|
|
normalized_object = attempted_object.replace('\\"', '"').replace("\\'", "'")
|
|
self.log(
|
|
"Parsed object is empty but the input starts like an escaped object key, normalizing and reparsing it as an object",
|
|
)
|
|
return "object", normalized_object
|
|
body = _strip_comments_for_empty_object_classification(body).lstrip()
|
|
if not body:
|
|
return "keep", None
|
|
|
|
in_quote: str | None = None
|
|
backslashes = 0
|
|
for char in body:
|
|
if char == "\\":
|
|
backslashes += 1
|
|
continue
|
|
if in_quote is not None:
|
|
if char == in_quote and backslashes % 2 == 0:
|
|
in_quote = None
|
|
elif char in STRING_DELIMITERS and backslashes % 2 == 0:
|
|
in_quote = char
|
|
elif char == ":" and backslashes % 2 == 0:
|
|
self.log(
|
|
"Parsed object is empty but the input still contains an object-style separator, keeping object repair",
|
|
)
|
|
return "keep", None
|
|
backslashes = 0
|
|
if (
|
|
schema_repairer is not None
|
|
and schema_repairer.schema_repair_mode == "salvage"
|
|
and isinstance(schema, dict)
|
|
and schema_repairer.is_object_schema(schema)
|
|
and not schema_repairer.is_array_schema(schema)
|
|
):
|
|
return "schema_set_object", None
|
|
return "array", None
|
|
|
|
|
|
def _merge_object_array_continuation(
|
|
self: "JSONParser",
|
|
obj: dict[str, JSONReturnType],
|
|
) -> bool:
|
|
prev_key = list(obj.keys())[-1] if obj else None
|
|
if not prev_key or not isinstance(obj[prev_key], list) or self.strict:
|
|
return False
|
|
|
|
self.index += 1
|
|
new_array = self.parse_array()
|
|
if isinstance(new_array, list):
|
|
prev_value = obj[prev_key]
|
|
if isinstance(prev_value, list):
|
|
list_lengths = [len(item) for item in prev_value if isinstance(item, list)]
|
|
expected_len = (
|
|
list_lengths[0] if list_lengths and all(length == list_lengths[0] for length in list_lengths) else None
|
|
)
|
|
if expected_len:
|
|
tail = []
|
|
while prev_value and not isinstance(prev_value[-1], list):
|
|
tail.append(prev_value.pop())
|
|
if tail:
|
|
tail.reverse()
|
|
if len(tail) % expected_len == 0:
|
|
self.log(
|
|
"While parsing an object we found row values without an inner array, grouping them into rows",
|
|
)
|
|
for i in range(0, len(tail), expected_len):
|
|
prev_value.append(tail[i : i + expected_len])
|
|
else:
|
|
prev_value.extend(tail)
|
|
if new_array:
|
|
if all(isinstance(item, list) for item in new_array):
|
|
self.log(
|
|
"While parsing an object we found additional rows, appending them without flattening",
|
|
)
|
|
prev_value.extend(new_array)
|
|
else:
|
|
prev_value.append(new_array)
|
|
else:
|
|
prev_value.extend(new_array[0] if len(new_array) == 1 and isinstance(new_array[0], list) else new_array)
|
|
|
|
self.skip_whitespaces()
|
|
if self.get_char_at() == ",":
|
|
self.index += 1
|
|
self.skip_whitespaces()
|
|
return True
|
|
|
|
|
|
def _parse_object_key(
|
|
self: "JSONParser",
|
|
obj: dict[str, JSONReturnType],
|
|
) -> tuple[str, int]:
|
|
key = ""
|
|
rollback_index = self.index
|
|
self.context.set(ContextValues.OBJECT_KEY)
|
|
try:
|
|
while self.get_char_at():
|
|
rollback_index = self.index
|
|
if self.get_char_at() == "[" and key == "" and _merge_object_array_continuation(self, obj):
|
|
continue
|
|
|
|
raw_key = self.parse_string()
|
|
assert isinstance(raw_key, str)
|
|
key = raw_key
|
|
if key == "":
|
|
self.skip_whitespaces()
|
|
if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
|
|
if key == "" and self.strict:
|
|
self.log(
|
|
"Empty key found in strict mode while parsing object, raising an error",
|
|
)
|
|
raise ValueError("Empty key found in strict mode while parsing object.")
|
|
break
|
|
finally:
|
|
self.context.reset()
|
|
return key, rollback_index
|
|
|
|
|
|
def _should_split_duplicate_object(self: "JSONParser", rollback_index: int) -> bool:
|
|
lookback_idx = rollback_index - self.index - 1
|
|
prev_non_whitespace = self.get_char_at(lookback_idx)
|
|
while prev_non_whitespace and prev_non_whitespace.isspace():
|
|
lookback_idx -= 1
|
|
prev_non_whitespace = self.get_char_at(lookback_idx)
|
|
key_start_char = self.get_char_at(rollback_index - self.index)
|
|
next_non_whitespace = self.get_char_at(self.scroll_whitespaces())
|
|
return not (key_start_char in STRING_DELIMITERS and prev_non_whitespace == "," and next_non_whitespace == ":")
|
|
|
|
|
|
def _split_object_on_duplicate_key(self: "JSONParser", rollback_index: int) -> None:
|
|
self.index = rollback_index - 1
|
|
self.json_str = self.json_str[: self.index + 1] + "{" + self.json_str[self.index + 1 :]
|
|
|
|
|
|
def _resolve_object_property_schema(
|
|
self: "JSONParser",
|
|
schema_repairer: "SchemaRepairer | None",
|
|
schema_config: ObjectSchemaConfig | None,
|
|
key: str,
|
|
) -> tuple[dict[str, Any] | bool | None, list[dict[str, Any] | bool | None], bool]:
|
|
if schema_repairer is None or schema_config is None:
|
|
return None, [], False
|
|
|
|
prop_schema: dict[str, Any] | bool | None = None
|
|
extra_schemas: list[dict[str, Any] | bool | None] = []
|
|
if key in schema_config.properties:
|
|
schema_value = schema_config.properties[key]
|
|
if schema_value is not None and not isinstance(schema_value, (dict, bool)):
|
|
raise ValueError("Schema must be an object.")
|
|
prop_schema = cast("dict[str, Any] | bool | None", schema_value)
|
|
return prop_schema, extra_schemas, False
|
|
|
|
matched: list[Any] = []
|
|
unsupported_patterns: list[str] = []
|
|
if schema_config.pattern_properties:
|
|
matched, unsupported_patterns = match_pattern_properties(schema_config.pattern_properties, key)
|
|
for pattern in unsupported_patterns:
|
|
self.log(
|
|
f"Skipped unsupported patternProperties regex '{pattern}' while parsing object key '{key}'",
|
|
)
|
|
if matched:
|
|
primary_schema = matched[0]
|
|
if primary_schema is not None and not isinstance(primary_schema, (dict, bool)):
|
|
raise ValueError("Schema must be an object.")
|
|
prop_schema = cast("dict[str, Any] | bool | None", primary_schema)
|
|
for extra_schema in matched[1:]:
|
|
if extra_schema is not None and not isinstance(extra_schema, (dict, bool)):
|
|
raise ValueError("Schema must be an object.")
|
|
extra_schemas.append(cast("dict[str, Any] | bool | None", extra_schema))
|
|
return prop_schema, extra_schemas, False
|
|
|
|
if schema_config.additional_properties is False:
|
|
return None, [], True
|
|
if isinstance(schema_config.additional_properties, dict):
|
|
return cast("dict[str, Any]", schema_config.additional_properties), [], False
|
|
return True, [], False
|
|
|
|
|
|
def _parse_object_value(
|
|
self: "JSONParser",
|
|
schema_repairer: "SchemaRepairer | None",
|
|
prop_schema: dict[str, Any] | bool | None,
|
|
key_path: str,
|
|
) -> JSONReturnType:
|
|
self.context.set(ContextValues.OBJECT_VALUE)
|
|
try:
|
|
self.skip_whitespaces()
|
|
char = self.get_char_at()
|
|
if char in [",", "}"]:
|
|
self.log(
|
|
f"While parsing an object value we found a stray {char}, ignoring it",
|
|
)
|
|
if schema_repairer is not None:
|
|
return schema_repairer.repair_value(MISSING_VALUE, prop_schema, key_path)
|
|
return ""
|
|
|
|
if schema_repairer is not None:
|
|
return self.parse_json(prop_schema, key_path)
|
|
return self.parse_json()
|
|
finally:
|
|
self.context.reset()
|
|
|
|
|
|
def _repair_empty_object_result(
|
|
self: "JSONParser",
|
|
obj: dict[str, JSONReturnType],
|
|
start_index: int,
|
|
schema: dict[str, Any] | bool | None,
|
|
path: str,
|
|
schema_repairer: "SchemaRepairer | None",
|
|
) -> tuple[bool, JSONReturnType]:
|
|
if obj or self.index - start_index <= 2:
|
|
return False, None
|
|
|
|
if self.strict:
|
|
self.log(
|
|
"Parsed object is empty but contains extra characters in strict mode, raising an error",
|
|
)
|
|
raise ValueError("Parsed object is empty but contains extra characters in strict mode.")
|
|
|
|
empty_object_repair, normalized_object = _classify_empty_object_repair(self, start_index, schema, schema_repairer)
|
|
if empty_object_repair == "object" and normalized_object is not None:
|
|
end_index = self.index + 1
|
|
self.json_str = self.json_str[: start_index - 1] + normalized_object + self.json_str[end_index:]
|
|
self.index = start_index
|
|
with self.context.enter(ContextValues.OBJECT_KEY):
|
|
repaired_value = self.parse_object(schema, path)
|
|
self.deferred_contexts.append(ContextValues.OBJECT_KEY)
|
|
return True, repaired_value
|
|
if empty_object_repair == "schema_set_object":
|
|
self.log(
|
|
"Parsed object is empty but salvage schema expects an object, reparsing set-like members as null-valued object keys",
|
|
)
|
|
self.index = start_index
|
|
with self.context.enter(ContextValues.OBJECT_KEY):
|
|
set_items = self.parse_array()
|
|
self.deferred_contexts.append(ContextValues.OBJECT_KEY)
|
|
if isinstance(set_items, list):
|
|
key_candidates: list[str] = [item for item in set_items if isinstance(item, str) and item]
|
|
if len(key_candidates) == len(set_items):
|
|
return True, cast("JSONReturnType", dict.fromkeys(key_candidates))
|
|
return True, set_items
|
|
if empty_object_repair == "array":
|
|
self.log("Parsed object is empty, we will try to parse this as an array instead")
|
|
self.index = start_index
|
|
with self.context.enter(ContextValues.OBJECT_KEY):
|
|
repaired_array = self.parse_array()
|
|
self.deferred_contexts.append(ContextValues.OBJECT_KEY)
|
|
return True, repaired_array
|
|
return False, None
|
|
|
|
|
|
def _complete_object_parse(
|
|
self: "JSONParser",
|
|
obj: dict[str, JSONReturnType],
|
|
schema: dict[str, Any] | bool | None,
|
|
path: str,
|
|
schema_repairer: "SchemaRepairer | None",
|
|
schema_config: ObjectSchemaConfig | None,
|
|
) -> JSONReturnType:
|
|
if not self.context.empty:
|
|
if self.get_char_at() == "}" and self.context.current not in [
|
|
ContextValues.OBJECT_KEY,
|
|
ContextValues.OBJECT_VALUE,
|
|
]:
|
|
self.log(
|
|
"Found an extra closing brace that shouldn't be there, skipping it",
|
|
)
|
|
self.index += 1
|
|
return obj
|
|
|
|
self.skip_whitespaces()
|
|
if self.get_char_at() == ",":
|
|
self.index += 1
|
|
self.skip_whitespaces()
|
|
if self.get_char_at() in STRING_DELIMITERS and not self.strict:
|
|
self.log(
|
|
"Found a comma and string delimiter after object closing brace, checking for additional key-value pairs",
|
|
)
|
|
additional_obj = self.parse_object(schema, path)
|
|
if isinstance(additional_obj, dict):
|
|
obj.update(additional_obj)
|
|
|
|
return _finalize_object(obj, schema_repairer, schema_config, path)
|
|
|
|
|
|
def parse_object(
|
|
self: "JSONParser",
|
|
schema: dict[str, Any] | bool | None = None,
|
|
path: str = "$",
|
|
) -> JSONReturnType:
|
|
# <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
|
|
obj: dict[str, JSONReturnType] = {}
|
|
start_index = self.index
|
|
parsing_object_value = self.context.current == ContextValues.OBJECT_VALUE
|
|
schema_repairer, schema, schema_config = resolve_parser_object_schema(self.schema_repairer, schema)
|
|
|
|
while (self.get_char_at() or "}") != "}":
|
|
self.skip_whitespaces()
|
|
|
|
if self.get_char_at() == ":":
|
|
self.log(
|
|
"While parsing an object we found a : before a key, ignoring",
|
|
)
|
|
self.index += 1
|
|
|
|
key, rollback_index = _parse_object_key(self, obj)
|
|
if ContextValues.ARRAY in self.context.context and key in obj:
|
|
if self.strict:
|
|
self.log("Duplicate key found in strict mode while parsing object, raising an error")
|
|
raise ValueError("Duplicate key found in strict mode while parsing object.")
|
|
if not parsing_object_value:
|
|
if _should_split_duplicate_object(self, rollback_index):
|
|
self.log(
|
|
"While parsing an object we found a duplicate key, closing the object here and rolling back the index",
|
|
)
|
|
_split_object_on_duplicate_key(self, rollback_index)
|
|
break
|
|
self.log(
|
|
"While parsing an object we found a duplicate key with a normal comma separator, keeping duplicate-key overwrite behavior",
|
|
)
|
|
|
|
self.skip_whitespaces()
|
|
if (self.get_char_at() or "}") == "}":
|
|
continue
|
|
|
|
self.skip_whitespaces()
|
|
if self.get_char_at() != ":":
|
|
if self.strict:
|
|
self.log(
|
|
"Missing ':' after key in strict mode while parsing object, raising an error",
|
|
)
|
|
raise ValueError("Missing ':' after key in strict mode while parsing object.")
|
|
self.log(
|
|
"While parsing an object we missed a : after a key",
|
|
)
|
|
|
|
self.index += 1
|
|
prop_schema, extra_schemas, drop_property = _resolve_object_property_schema(
|
|
self,
|
|
schema_repairer,
|
|
schema_config,
|
|
key,
|
|
)
|
|
key_path = f"{path}.{key}"
|
|
value = _parse_object_value(self, schema_repairer, prop_schema, key_path)
|
|
|
|
if schema_repairer is not None:
|
|
for extra_schema in extra_schemas:
|
|
value = schema_repairer.repair_value(value, extra_schema, key_path)
|
|
|
|
if schema_repairer is None and value == "" and self.strict and self.get_char_at(-1) not in STRING_DELIMITERS:
|
|
self.log(
|
|
"Parsed value is empty in strict mode while parsing object, raising an error",
|
|
)
|
|
raise ValueError("Parsed value is empty in strict mode while parsing object.")
|
|
|
|
if schema_repairer is None or not drop_property:
|
|
obj[key] = value
|
|
else:
|
|
schema_repairer._log("Dropped extra property not covered by schema", key_path)
|
|
|
|
if self.get_char_at() in [",", "'", '"']:
|
|
self.index += 1
|
|
if self.get_char_at() == "]" and ContextValues.ARRAY in self.context.context:
|
|
self.log(
|
|
"While parsing an object we found a closing array bracket, closing the object here and rolling back the index"
|
|
)
|
|
self.index -= 1
|
|
break
|
|
self.skip_whitespaces()
|
|
|
|
self.index += 1
|
|
|
|
repaired_empty_object, repaired_value = _repair_empty_object_result(
|
|
self,
|
|
obj,
|
|
start_index,
|
|
schema,
|
|
path,
|
|
schema_repairer,
|
|
)
|
|
if repaired_empty_object:
|
|
return repaired_value
|
|
|
|
return _complete_object_parse(
|
|
self,
|
|
obj,
|
|
schema,
|
|
path,
|
|
schema_repairer,
|
|
schema_config,
|
|
)
|