MoFin/venv/lib/python3.12/site-packages/json_repair/parse_object.py

from typing import TYPE_CHECKING, Any, cast

from .parser_schema import ObjectSchemaConfig, resolve_parser_object_schema
from .utils.constants import MISSING_VALUE, STRING_DELIMITERS, JSONReturnType
from .utils.json_context import ContextValues
from .utils.pattern_properties import match_pattern_properties

if TYPE_CHECKING:
    from .json_parser import JSONParser
    from .schema_repair import SchemaRepairer


def _finalize_object(
    obj: dict[str, JSONReturnType],
    schema_repairer: "SchemaRepairer | None",
    schema_config: ObjectSchemaConfig | None,
    path: str,
) -> dict[str, JSONReturnType]:
    if schema_repairer is None or schema_config is None:
        return obj

    missing_required = [key for key in schema_config.required if key not in obj]
    if missing_required and schema_repairer.schema_repair_mode != "salvage":
        raise ValueError(f"Missing required properties at {path}: {', '.join(missing_required)}")

    for key, prop_schema in schema_config.properties.items():
        if key in obj or key in schema_config.required:
            continue
        if isinstance(prop_schema, dict) and "default" in prop_schema:
            obj[key] = schema_repairer._copy_json_value(prop_schema["default"], f"{path}.{key}", "default")
            schema_repairer._log("Inserted default value for missing property", f"{path}.{key}")
    return obj


def _strip_comments_for_empty_object_classification(body: str) -> str:
    stripped = []
    in_quote: str | None = None
    backslashes = 0
    index = 0
    while index < len(body):
        char = body[index]
        next_char = body[index + 1] if index + 1 < len(body) else ""

        if char == "\\":
            backslashes += 1
            stripped.append(char)
            index += 1
            continue
        if in_quote is not None:
            stripped.append(char)
            if char == in_quote and backslashes % 2 == 0:
                in_quote = None
            backslashes = 0
            index += 1
            continue
        if char in STRING_DELIMITERS and backslashes % 2 == 0:
            in_quote = char
            stripped.append(char)
            backslashes = 0
            index += 1
            continue
        backslashes = 0

        if char == "#" or (char == "/" and next_char == "/"):
            index += 2 if char == "/" else 1
            while index < len(body) and body[index] not in ["\n", "\r"]:
                index += 1
            continue
        if char == "/" and next_char == "*":
            index += 2
            while index < len(body) - 1 and body[index : index + 2] != "*/":
                index += 1
            index = min(index + 2, len(body))
            continue

        stripped.append(char)
        index += 1

    return "".join(stripped)


def _classify_empty_object_repair(
    self: "JSONParser",
    start_index: int,
    schema: dict[str, Any] | bool | None,
    schema_repairer: "SchemaRepairer | None",
) -> tuple[str, str | None]:
    attempted_object = self.json_str[start_index - 1 : self.index + 1]
    body = attempted_object[1:]
    body = body.removesuffix("}")
    body = body.lstrip()
    if not body:
        return "keep", None
    if (body.startswith('\\"') and '\\":' in body) or (body.startswith("\\'") and "\\':" in body):
        normalized_object = attempted_object.replace('\\"', '"').replace("\\'", "'")
        self.log(
            "Parsed object is empty but the input starts like an escaped object key, normalizing and reparsing it as an object",
        )
        return "object", normalized_object
    body = _strip_comments_for_empty_object_classification(body).lstrip()
    if not body:
        return "keep", None

    in_quote: str | None = None
    backslashes = 0
    for char in body:
        if char == "\\":
            backslashes += 1
            continue
        if in_quote is not None:
            if char == in_quote and backslashes % 2 == 0:
                in_quote = None
        elif char in STRING_DELIMITERS and backslashes % 2 == 0:
            in_quote = char
        elif char == ":" and backslashes % 2 == 0:
            self.log(
                "Parsed object is empty but the input still contains an object-style separator, keeping object repair",
            )
            return "keep", None
        backslashes = 0
    if (
        schema_repairer is not None
        and schema_repairer.schema_repair_mode == "salvage"
        and isinstance(schema, dict)
        and schema_repairer.is_object_schema(schema)
        and not schema_repairer.is_array_schema(schema)
    ):
        return "schema_set_object", None
    return "array", None


def _merge_object_array_continuation(
    self: "JSONParser",
    obj: dict[str, JSONReturnType],
) -> bool:
    prev_key = list(obj.keys())[-1] if obj else None
    if not prev_key or not isinstance(obj[prev_key], list) or self.strict:
        return False

    self.index += 1
    new_array = self.parse_array()
    if isinstance(new_array, list):
        prev_value = obj[prev_key]
        if isinstance(prev_value, list):
            list_lengths = [len(item) for item in prev_value if isinstance(item, list)]
            expected_len = (
                list_lengths[0] if list_lengths and all(length == list_lengths[0] for length in list_lengths) else None
            )
            if expected_len:
                tail = []
                while prev_value and not isinstance(prev_value[-1], list):
                    tail.append(prev_value.pop())
                if tail:
                    tail.reverse()
                    if len(tail) % expected_len == 0:
                        self.log(
                            "While parsing an object we found row values without an inner array, grouping them into rows",
                        )
                        for i in range(0, len(tail), expected_len):
                            prev_value.append(tail[i : i + expected_len])
                    else:
                        prev_value.extend(tail)
                if new_array:
                    if all(isinstance(item, list) for item in new_array):
                        self.log(
                            "While parsing an object we found additional rows, appending them without flattening",
                        )
                        prev_value.extend(new_array)
                    else:
                        prev_value.append(new_array)
            else:
                prev_value.extend(new_array[0] if len(new_array) == 1 and isinstance(new_array[0], list) else new_array)

    self.skip_whitespaces()
    if self.get_char_at() == ",":
        self.index += 1
    self.skip_whitespaces()
    return True


def _parse_object_key(
    self: "JSONParser",
    obj: dict[str, JSONReturnType],
) -> tuple[str, int]:
    key = ""
    rollback_index = self.index
    self.context.set(ContextValues.OBJECT_KEY)
    try:
        while self.get_char_at():
            rollback_index = self.index
            if self.get_char_at() == "[" and key == "" and _merge_object_array_continuation(self, obj):
                continue

            raw_key = self.parse_string()
            assert isinstance(raw_key, str)
            key = raw_key
            if key == "":
                self.skip_whitespaces()
            if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
                if key == "" and self.strict:
                    self.log(
                        "Empty key found in strict mode while parsing object, raising an error",
                    )
                    raise ValueError("Empty key found in strict mode while parsing object.")
                break
    finally:
        self.context.reset()
    return key, rollback_index


def _should_split_duplicate_object(self: "JSONParser", rollback_index: int) -> bool:
    lookback_idx = rollback_index - self.index - 1
    prev_non_whitespace = self.get_char_at(lookback_idx)
    while prev_non_whitespace and prev_non_whitespace.isspace():
        lookback_idx -= 1
        prev_non_whitespace = self.get_char_at(lookback_idx)
    key_start_char = self.get_char_at(rollback_index - self.index)
    next_non_whitespace = self.get_char_at(self.scroll_whitespaces())
    return not (key_start_char in STRING_DELIMITERS and prev_non_whitespace == "," and next_non_whitespace == ":")


def _split_object_on_duplicate_key(self: "JSONParser", rollback_index: int) -> None:
    self.index = rollback_index - 1
    self.json_str = self.json_str[: self.index + 1] + "{" + self.json_str[self.index + 1 :]


def _resolve_object_property_schema(
    self: "JSONParser",
    schema_repairer: "SchemaRepairer | None",
    schema_config: ObjectSchemaConfig | None,
    key: str,
) -> tuple[dict[str, Any] | bool | None, list[dict[str, Any] | bool | None], bool]:
    if schema_repairer is None or schema_config is None:
        return None, [], False

    prop_schema: dict[str, Any] | bool | None = None
    extra_schemas: list[dict[str, Any] | bool | None] = []
    if key in schema_config.properties:
        schema_value = schema_config.properties[key]
        if schema_value is not None and not isinstance(schema_value, (dict, bool)):
            raise ValueError("Schema must be an object.")
        prop_schema = cast("dict[str, Any] | bool | None", schema_value)
        return prop_schema, extra_schemas, False

    matched: list[Any] = []
    unsupported_patterns: list[str] = []
    if schema_config.pattern_properties:
        matched, unsupported_patterns = match_pattern_properties(schema_config.pattern_properties, key)
    for pattern in unsupported_patterns:
        self.log(
            f"Skipped unsupported patternProperties regex '{pattern}' while parsing object key '{key}'",
        )
    if matched:
        primary_schema = matched[0]
        if primary_schema is not None and not isinstance(primary_schema, (dict, bool)):
            raise ValueError("Schema must be an object.")
        prop_schema = cast("dict[str, Any] | bool | None", primary_schema)
        for extra_schema in matched[1:]:
            if extra_schema is not None and not isinstance(extra_schema, (dict, bool)):
                raise ValueError("Schema must be an object.")
            extra_schemas.append(cast("dict[str, Any] | bool | None", extra_schema))
        return prop_schema, extra_schemas, False

    if schema_config.additional_properties is False:
        return None, [], True
    if isinstance(schema_config.additional_properties, dict):
        return cast("dict[str, Any]", schema_config.additional_properties), [], False
    return True, [], False


def _parse_object_value(
    self: "JSONParser",
    schema_repairer: "SchemaRepairer | None",
    prop_schema: dict[str, Any] | bool | None,
    key_path: str,
) -> JSONReturnType:
    self.context.set(ContextValues.OBJECT_VALUE)
    try:
        self.skip_whitespaces()
        char = self.get_char_at()
        if char in [",", "}"]:
            self.log(
                f"While parsing an object value we found a stray {char}, ignoring it",
            )
            if schema_repairer is not None:
                return schema_repairer.repair_value(MISSING_VALUE, prop_schema, key_path)
            return ""

        if schema_repairer is not None:
            return self.parse_json(prop_schema, key_path)
        return self.parse_json()
    finally:
        self.context.reset()


def _repair_empty_object_result(
    self: "JSONParser",
    obj: dict[str, JSONReturnType],
    start_index: int,
    schema: dict[str, Any] | bool | None,
    path: str,
    schema_repairer: "SchemaRepairer | None",
) -> tuple[bool, JSONReturnType]:
    if obj or self.index - start_index <= 2:
        return False, None

    if self.strict:
        self.log(
            "Parsed object is empty but contains extra characters in strict mode, raising an error",
        )
        raise ValueError("Parsed object is empty but contains extra characters in strict mode.")

    empty_object_repair, normalized_object = _classify_empty_object_repair(self, start_index, schema, schema_repairer)
    if empty_object_repair == "object" and normalized_object is not None:
        end_index = self.index + 1
        self.json_str = self.json_str[: start_index - 1] + normalized_object + self.json_str[end_index:]
        self.index = start_index
        with self.context.enter(ContextValues.OBJECT_KEY):
            repaired_value = self.parse_object(schema, path)
        self.deferred_contexts.append(ContextValues.OBJECT_KEY)
        return True, repaired_value
    if empty_object_repair == "schema_set_object":
        self.log(
            "Parsed object is empty but salvage schema expects an object, reparsing set-like members as null-valued object keys",
        )
        self.index = start_index
        with self.context.enter(ContextValues.OBJECT_KEY):
            set_items = self.parse_array()
        self.deferred_contexts.append(ContextValues.OBJECT_KEY)
        if isinstance(set_items, list):
            key_candidates: list[str] = [item for item in set_items if isinstance(item, str) and item]
            if len(key_candidates) == len(set_items):
                return True, cast("JSONReturnType", dict.fromkeys(key_candidates))
        return True, set_items
    if empty_object_repair == "array":
        self.log("Parsed object is empty, we will try to parse this as an array instead")
        self.index = start_index
        with self.context.enter(ContextValues.OBJECT_KEY):
            repaired_array = self.parse_array()
        self.deferred_contexts.append(ContextValues.OBJECT_KEY)
        return True, repaired_array
    return False, None


def _complete_object_parse(
    self: "JSONParser",
    obj: dict[str, JSONReturnType],
    schema: dict[str, Any] | bool | None,
    path: str,
    schema_repairer: "SchemaRepairer | None",
    schema_config: ObjectSchemaConfig | None,
) -> JSONReturnType:
    if not self.context.empty:
        if self.get_char_at() == "}" and self.context.current not in [
            ContextValues.OBJECT_KEY,
            ContextValues.OBJECT_VALUE,
        ]:
            self.log(
                "Found an extra closing brace that shouldn't be there, skipping it",
            )
            self.index += 1
        return obj

    self.skip_whitespaces()
    if self.get_char_at() == ",":
        self.index += 1
        self.skip_whitespaces()
        if self.get_char_at() in STRING_DELIMITERS and not self.strict:
            self.log(
                "Found a comma and string delimiter after object closing brace, checking for additional key-value pairs",
            )
            additional_obj = self.parse_object(schema, path)
            if isinstance(additional_obj, dict):
                obj.update(additional_obj)

    return _finalize_object(obj, schema_repairer, schema_config, path)


def parse_object(
    self: "JSONParser",
    schema: dict[str, Any] | bool | None = None,
    path: str = "$",
) -> JSONReturnType:
    # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
    obj: dict[str, JSONReturnType] = {}
    start_index = self.index
    parsing_object_value = self.context.current == ContextValues.OBJECT_VALUE
    schema_repairer, schema, schema_config = resolve_parser_object_schema(self.schema_repairer, schema)

    while (self.get_char_at() or "}") != "}":
        self.skip_whitespaces()

        if self.get_char_at() == ":":
            self.log(
                "While parsing an object we found a : before a key, ignoring",
            )
            self.index += 1

        key, rollback_index = _parse_object_key(self, obj)
        if ContextValues.ARRAY in self.context.context and key in obj:
            if self.strict:
                self.log("Duplicate key found in strict mode while parsing object, raising an error")
                raise ValueError("Duplicate key found in strict mode while parsing object.")
            if not parsing_object_value:
                if _should_split_duplicate_object(self, rollback_index):
                    self.log(
                        "While parsing an object we found a duplicate key, closing the object here and rolling back the index",
                    )
                    _split_object_on_duplicate_key(self, rollback_index)
                    break
                self.log(
                    "While parsing an object we found a duplicate key with a normal comma separator, keeping duplicate-key overwrite behavior",
                )

        self.skip_whitespaces()
        if (self.get_char_at() or "}") == "}":
            continue

        self.skip_whitespaces()
        if self.get_char_at() != ":":
            if self.strict:
                self.log(
                    "Missing ':' after key in strict mode while parsing object, raising an error",
                )
                raise ValueError("Missing ':' after key in strict mode while parsing object.")
            self.log(
                "While parsing an object we missed a : after a key",
            )

        self.index += 1
        prop_schema, extra_schemas, drop_property = _resolve_object_property_schema(
            self,
            schema_repairer,
            schema_config,
            key,
        )
        key_path = f"{path}.{key}"
        value = _parse_object_value(self, schema_repairer, prop_schema, key_path)

        if schema_repairer is not None:
            for extra_schema in extra_schemas:
                value = schema_repairer.repair_value(value, extra_schema, key_path)

        if schema_repairer is None and value == "" and self.strict and self.get_char_at(-1) not in STRING_DELIMITERS:
            self.log(
                "Parsed value is empty in strict mode while parsing object, raising an error",
            )
            raise ValueError("Parsed value is empty in strict mode while parsing object.")

        if schema_repairer is None or not drop_property:
            obj[key] = value
        else:
            schema_repairer._log("Dropped extra property not covered by schema", key_path)

        if self.get_char_at() in [",", "'", '"']:
            self.index += 1
        if self.get_char_at() == "]" and ContextValues.ARRAY in self.context.context:
            self.log(
                "While parsing an object we found a closing array bracket, closing the object here and rolling back the index"
            )
            self.index -= 1
            break
        self.skip_whitespaces()

    self.index += 1

    repaired_empty_object, repaired_value = _repair_empty_object_result(
        self,
        obj,
        start_index,
        schema,
        path,
        schema_repairer,
    )
    if repaired_empty_object:
        return repaired_value

    return _complete_object_parse(
        self,
        obj,
        schema,
        path,
        schema_repairer,
        schema_config,
    )