utils/json_extractor/extractor.py

import json
import os
from pathlib import Path
from urllib.parse import urlparse
from collections import defaultdict
from .rules import is_ignored_url, get_key_mask, get_value_type

class SchemaExtractor:
    """Infer a merged structural schema from captured network response bodies.

    Bodies are grouped by a category derived from the request URL path. For
    each category a tree of *schema nodes* is accumulated. A schema node is a
    plain dict with:

    * ``"types"``: set of type names returned by ``get_value_type``
    * ``"count"``: number of values merged into the node
    * ``"properties"`` (optional): masked key -> child node, for dict values
    * ``"items"`` (optional): child node shared by all list elements, or
      ``None`` when only empty lists have been seen
    * ``"examples"`` (optional): set of up to ``MAX_EXAMPLES`` stringified
      primitive values
    """

    # Maximum number of distinct example strings kept per node.
    MAX_EXAMPLES = 5
    # Path segments starting with this prefix are masked as ``{match_id}``.
    MATCH_ID_PREFIX = 'g161-'

    def __init__(self):
        # schemas: category -> schema node
        self.schemas = {}
        # url_counts: category -> number of bodies merged for that category
        self.url_counts = defaultdict(int)

    def get_url_category(self, url):
        """
        Derive a category name from the path component of ``url``.

        Segments starting with ``MATCH_ID_PREFIX`` become ``{match_id}`` and
        all-digit segments longer than four characters become ``{id}``.
        An empty path yields ``"root"``. Query string and host are ignored.
        """
        segments = urlparse(url).path.strip('/').split('/')
        masked = []
        for segment in segments:
            if segment.startswith(self.MATCH_ID_PREFIX):
                masked.append('{match_id}')
            elif segment.isdigit() and len(segment) > 4:
                masked.append('{id}')
            else:
                masked.append(segment)
        return "/".join(masked) or "root"

    def process_directory(self, root_dir):
        """
        Process every ``iframe_network.json`` file found under ``root_dir``.

        Files are visited in sorted path order so that the merged schema
        (property order and which examples are captured) is reproducible
        regardless of filesystem enumeration order.
        """
        files = sorted(Path(root_dir).rglob("iframe_network.json"))
        total = len(files)
        print(f"Found {total} files to process.")

        for i, filepath in enumerate(files):
            if i % 10 == 0:
                print(f"Processing {i}/{total}: {filepath}")
            self.process_file(filepath)

    def process_file(self, filepath):
        """
        Merge the response bodies recorded in one capture file.

        The file must contain a JSON list of entry dicts. Only entries with
        a non-empty, non-ignored ``url``, ``status == 200`` and a truthy
        ``body`` are merged. Unreadable or malformed files, non-list
        documents and non-dict entries are skipped silently (best effort).
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError, RecursionError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            return

        if not isinstance(data, list):
            return

        for entry in data:
            # Guard against stray scalars/nulls in the list; they have no .get().
            if not isinstance(entry, dict):
                continue

            url = entry.get('url', '')
            if not url or is_ignored_url(url):
                continue

            if entry.get('status') != 200:
                continue

            body = entry.get('body')
            # Empty bodies (None, "", {}, []) carry no structure worth merging.
            if not body:
                continue

            category = self.get_url_category(url)
            self.url_counts[category] += 1
            self.schemas[category] = self.merge_value(
                self.schemas.get(category), body
            )

    def merge_value(self, schema, value):
        """
        Merge ``value`` into ``schema`` and return the (mutated) node.

        ``schema`` may be ``None``, in which case a new node is created.
        Dicts are merged per masked key, list elements are all merged into
        a single ``"items"`` node, and any other value contributes its
        string form to ``"examples"`` until ``MAX_EXAMPLES`` are held.
        """
        val_type = get_value_type(value)

        if schema is None:
            schema = {"types": {val_type}, "count": 1}
        else:
            schema["count"] += 1
            schema["types"].add(val_type)

        if isinstance(value, dict):
            properties = schema.setdefault("properties", {})
            for key, child in value.items():
                masked_key = get_key_mask(key)
                properties[masked_key] = self.merge_value(
                    properties.get(masked_key), child
                )
        elif isinstance(value, list):
            # Key is created even for an empty list so the node is
            # recognisable as an array.
            schema.setdefault("items", None)
            for item in value:
                schema["items"] = self.merge_value(schema["items"], item)
        else:
            examples = schema.setdefault("examples", set())
            if len(examples) < self.MAX_EXAMPLES:
                # Stored as strings so values of mixed types can share a set.
                examples.add(str(value))

        return schema

    def to_serializable(self, schema):
        """
        Convert a schema node (which contains sets) to JSON-serializable data.

        Sets become sorted lists and properties are emitted in sorted key
        order. ``None`` is passed through unchanged.
        """
        if schema is None:
            return None

        res = {
            "types": sorted(schema["types"]),
            "count": schema["count"],
        }

        if "properties" in schema:
            res["properties"] = {
                key: self.to_serializable(child)
                for key, child in sorted(schema["properties"].items())
            }

        if "items" in schema:
            res["items"] = self.to_serializable(schema["items"])

        if "examples" in schema:
            res["examples"] = sorted(schema["examples"])

        return res

    def export_report(self, output_path):
        """Write the full schema of every category to ``output_path`` as JSON."""
        report = {
            category: self.to_serializable(schema)
            for category, schema in self.schemas.items()
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)
        print(f"Report saved to {output_path}")

    def export_markdown_summary(self, output_path):
        """
        Write a Markdown outline of every category's hierarchy to ``output_path``.
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("# Schema Hierarchy Report\n\n")

            for category, schema in sorted(self.schemas.items()):
                f.write(f"## Category: `{category}`\n")
                f.write(f"**Total Requests**: {self.url_counts[category]}\n\n")

                self._write_markdown_schema(f, schema, level=0)
                f.write("\n---\n\n")
        print(f"Markdown summary saved to {output_path}")

    def export_csv_summary(self, output_path):
        """
        Write a flattened CSV summary (one row per path) to ``output_path``.
        """
        import csv
        with open(output_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(["Category", "Path", "Types", "Examples"])

            for category, schema in sorted(self.schemas.items()):
                self._write_csv_schema(writer, category, schema, path="")
        print(f"CSV summary saved to {output_path}")

    def _write_csv_schema(self, writer, category, schema, path):
        """
        Recursively emit CSV rows for ``schema`` rooted at ``path``.

        A row is written for every node that has no children to descend
        into (primitives, always-empty lists, empty dicts) and for every
        node that recorded primitive examples, so mixed nodes such as a
        nullable object still report their types. Up to three examples are
        listed, in sorted order so the output is stable between runs.
        """
        if schema is None:
            return

        properties = schema.get("properties")
        items = schema.get("items")
        has_children = bool(properties) or items is not None

        if not has_children or "examples" in schema:
            type_str = ", ".join(map(str, sorted(schema["types"])))
            ex_str = "; ".join(sorted(schema.get("examples", ()))[:3])
            writer.writerow([category, path, type_str, ex_str])

        if properties:
            for key, child in properties.items():
                new_path = f"{path}.{key}" if path else key
                self._write_csv_schema(writer, category, child, new_path)

        if items is not None:
            self._write_csv_schema(writer, category, items, f"{path}[]")

    @staticmethod
    def _describe_markdown_node(node):
        """Return ``"<types>[, e.g. <example>]"`` for one schema node.

        The example (smallest in sorted order, for determinism) is omitted
        when the node's types include ``"dict"`` or ``"list"``.
        NOTE(review): assumes ``get_value_type`` names containers exactly
        "dict" and "list" -- confirm against ``rules``.
        """
        types = node["types"]
        label = ", ".join(map(str, sorted(types)))
        examples = sorted(node.get("examples", ()))
        if examples and "dict" not in types and "list" not in types:
            label += f", e.g. {examples[0]}"
        return label

    def _write_markdown_schema(self, f, schema, level=0):
        """
        Recursively write ``schema`` as a nested Markdown bullet list.

        Each property is rendered as ``- **key** (types[, e.g. example])``
        and array element nodes as ``- *[Array Items]* (types...)``. A leaf
        reached at ``level == 0`` (a category whose body is a primitive) is
        rendered as ``- *[Value]* (...)``; deeper leaves have already been
        described by their parent's line.
        """
        if schema is None:
            return

        indent = "  " * level
        has_properties = "properties" in schema
        has_items = "items" in schema

        if not has_properties and not has_items:
            if level == 0:
                f.write(f"- *[Value]* ({self._describe_markdown_node(schema)})\n")
            return

        if has_properties:
            for key, child in schema["properties"].items():
                f.write(
                    f"{indent}- **{key}** ({self._describe_markdown_node(child)})\n"
                )
                self._write_markdown_schema(f, child, level + 1)

        if has_items:
            items = schema["items"]
            if items is None:
                # Only empty lists were observed; nothing is known about items.
                f.write(f"{indent}- *[Array Items]*\n")
            else:
                f.write(
                    f"{indent}- *[Array Items]* "
                    f"({self._describe_markdown_node(items)})\n"
                )
                self._write_markdown_schema(f, items, level + 1)
0.2: JSON schema extractor finished. 2026-01-23 18:17:45 +08:00			`import json`
			`import os`
			`from pathlib import Path`
			`from urllib.parse import urlparse`
			`from collections import defaultdict`
			`from .rules import is_ignored_url, get_key_mask, get_value_type`

			`class SchemaExtractor:`
			`def __init__(self):`
			`# schemas: category -> schema_node`
			`self.schemas = {}`
			`self.url_counts = defaultdict(int)`

			`def get_url_category(self, url):`
			`"""`
			`Derives a category name from the URL.`
			`"""`
			`parsed = urlparse(url)`
			`path = parsed.path`
			`parts = path.strip('/').split('/')`
			`cleaned_parts = []`
			`for p in parts:`
			`# Mask Match IDs (e.g., g161-...)`
			`if p.startswith('g161-'):`
			`cleaned_parts.append('{match_id}')`
			`# Mask other long numeric IDs`
			`elif p.isdigit() and len(p) > 4:`
			`cleaned_parts.append('{id}')`
			`else:`
			`cleaned_parts.append(p)`

			`category = "/".join(cleaned_parts)`
			`if not category:`
			`category = "root"`
			`return category`

			`def process_directory(self, root_dir):`
			`"""`
			`Iterates over all iframe_network.json files in the directory.`
			`"""`
			`p = Path(root_dir)`
			`# Use rglob to find all iframe_network.json files`
			`files = list(p.rglob("iframe_network.json"))`
			`print(f"Found {len(files)} files to process.")`

			`for i, filepath in enumerate(files):`
			`if i % 10 == 0:`
			`print(f"Processing {i}/{len(files)}: {filepath}")`
			`self.process_file(filepath)`

			`def process_file(self, filepath):`
			`try:`
			`with open(filepath, 'r', encoding='utf-8') as f:`
			`data = json.load(f)`
			`except Exception as e:`
			`# print(f"Error reading {filepath}: {e}")`
			`return`

			`if not isinstance(data, list):`
			`return`

			`for entry in data:`
			`url = entry.get('url', '')`
			`if not url or is_ignored_url(url):`
			`continue`

			`status = entry.get('status')`
			`if status != 200:`
			`continue`

			`body = entry.get('body')`
			`# Skip empty bodies or bodies that are just empty dicts if that's not useful`
			`if not body:`
			`continue`

			`category = self.get_url_category(url)`
			`self.url_counts[category] += 1`

			`if category not in self.schemas:`
			`self.schemas[category] = None`

			`self.schemas[category] = self.merge_value(self.schemas[category], body)`

			`def merge_value(self, schema, value):`
			`"""`
			`Merges a value into the existing schema.`
			`"""`
			`val_type = get_value_type(value)`

			`if schema is None:`
			`schema = {`
			`"types": {val_type},`
			`"count": 1`
			`}`
			`else:`
			`schema["count"] += 1`
			`schema["types"].add(val_type)`

			`# Handle Dicts`
			`if isinstance(value, dict):`
			`if "properties" not in schema:`
			`schema["properties"] = {}`

			`for k, v in value.items():`
			`masked_key = get_key_mask(k)`
			`schema["properties"][masked_key] = self.merge_value(`
			`schema["properties"].get(masked_key),`
			`v`
			`)`

			`# Handle Lists`
			`elif isinstance(value, list):`
			`if "items" not in schema:`
			`schema["items"] = None`

			`for item in value:`
			`schema["items"] = self.merge_value(schema["items"], item)`

			`# Handle Primitives (Capture examples if needed, currently just tracking types)`
			`else:`
			`if "examples" not in schema:`
			`schema["examples"] = set()`
			`if len(schema["examples"]) < 5:`
			`# Store string representation to avoid type issues in set`
			`schema["examples"].add(str(value))`

			`return schema`

			`def to_serializable(self, schema):`
			`"""`
			`Converts the internal schema structure (with sets) to a JSON-serializable format.`
			`"""`
			`if schema is None:`
			`return None`

			`res = {`
			`"types": list(sorted(schema["types"])),`
			`"count": schema["count"]`
			`}`

			`if "properties" in schema:`
			`res["properties"] = {`
			`k: self.to_serializable(v)`
			`for k, v in sorted(schema["properties"].items())`
			`}`

			`if "items" in schema:`
			`res["items"] = self.to_serializable(schema["items"])`

			`if "examples" in schema:`
			`res["examples"] = list(sorted(schema["examples"]))`

			`return res`

			`def export_report(self, output_path):`
			`report = {}`
			`for category, schema in self.schemas.items():`
			`report[category] = self.to_serializable(schema)`

			`with open(output_path, 'w', encoding='utf-8') as f:`
			`json.dump(report, f, indent=2, ensure_ascii=False)`
			`print(f"Report saved to {output_path}")`

			`def export_markdown_summary(self, output_path):`
			`"""`
			`Generates a Markdown summary of the hierarchy.`
			`"""`
			`with open(output_path, 'w', encoding='utf-8') as f:`
			`f.write("# Schema Hierarchy Report\n\n")`

			`for category, schema in sorted(self.schemas.items()):`
			``f.write(f"## Category: `{category}`\n")``
			`f.write(f"**Total Requests**: {self.url_counts[category]}\n\n")`

			`self._write_markdown_schema(f, schema, level=0)`
			`f.write("\n---\n\n")`
			`print(f"Markdown summary saved to {output_path}")`

			`def export_csv_summary(self, output_path):`
			`"""`
			`Generates a CSV summary of the flattened schema.`
			`"""`
			`import csv`
			`with open(output_path, 'w', encoding='utf-8', newline='') as f:`
			`writer = csv.writer(f)`
			`writer.writerow(["Category", "Path", "Types", "Examples"])`

			`for category, schema in sorted(self.schemas.items()):`
			`self._write_csv_schema(writer, category, schema, path="")`
			`print(f"CSV summary saved to {output_path}")`

			`def _write_csv_schema(self, writer, category, schema, path):`
			`if schema is None:`
			`return`

			`current_types = list(sorted(schema["types"]))`
			`type_str = ", ".join(map(str, current_types))`

			`# If it's a leaf or has no properties/items`
			`is_leaf = "properties" not in schema and "items" not in schema`

			`if is_leaf:`
			`examples = list(schema.get("examples", []))`
			`ex_str = "; ".join(examples[:3]) if examples else ""`
			`writer.writerow([category, path, type_str, ex_str])`

			`if "properties" in schema:`
			`for k, v in schema["properties"].items():`
			`new_path = f"{path}.{k}" if path else k`
			`self._write_csv_schema(writer, category, v, new_path)`

			`if "items" in schema:`
			`new_path = f"{path}[]"`
			`self._write_csv_schema(writer, category, schema["items"], new_path)`

			`def _write_markdown_schema(self, f, schema, level=0):`
			`if schema is None:`
			`return`

			`indent = "  " * level`
			`types = schema["types"]`
			`type_str = ", ".join([str(t) for t in types])`

			`# If it's a leaf (no props, no items)`
			`if "properties" not in schema and "items" not in schema:`
			`# Show examples`
			`examples = schema.get("examples", [])`
			`ex_str = f" (e.g., {', '.join(list(examples)[:3])})" if examples else ""`
			`return # We handle leaf printing in the parent loop for keys, or here if it's a root/list item`

			`if "properties" in schema:`
			`for k, v in schema["properties"].items():`
			`v_types = ", ".join(list(sorted(v["types"])))`
			`v_ex = list(v.get("examples", []))`
			`v_ex_str = f", e.g. {v_ex[0]}" if v_ex and "dict" not in v["types"] and "list" not in v["types"] else ""`

			`f.write(f"{indent}- **{k}** ({v_types}{v_ex_str})\n")`
			`self._write_markdown_schema(f, v, level + 1)`

			`if "items" in schema:`
			`f.write(f"{indent}- *[Array Items]*\n")`
			`self._write_markdown_schema(f, schema["items"], level + 1)`