0.2: Json schema extractor finished.

2026-01-23 18:17:45 +08:00
parent 81df352607
commit 0a78c78fc7
10 changed files with 9038 additions and 0 deletions
--- a/utils/json_extractor/analyze_structure.py
+++ b/utils/json_extractor/analyze_structure.py
@@ -0,0 +1,101 @@
+import json
+import os
+from pathlib import Path
+from collections import defaultdict
+
+def analyze_structures(root_dir):
+    """Print API coverage and fight-key consistency for captured responses.
+
+    Recursively scans *root_dir* for ``iframe_network.json`` files. For each
+    file it records whether any entry URL contains ``api/match/round/`` or
+    ``api/match/leetify_rating/``, and it collects the union of keys seen in
+    every player's ``fight``, ``fight_t`` and ``fight_ct`` dicts inside
+    ``api/data/match/`` response bodies. The summary is printed to stdout.
+
+    Files that cannot be read or parsed, files whose top-level JSON value is
+    not a list, and entries that are not dicts (or whose ``url`` is not a
+    string) are skipped instead of aborting the scan.
+
+    Args:
+        root_dir: Directory to search (str or path-like).
+
+    Returns:
+        None. All results are written to stdout.
+    """
+    key_sets = {'fight': set(), 'fight_t': set(), 'fight_ct': set()}
+    round_files = set()
+    leetify_files = set()
+
+    for filepath in Path(root_dir).rglob("iframe_network.json"):
+        entries = _load_entries(filepath)
+        if entries is None:
+            continue
+
+        has_round = False
+        has_leetify = False
+
+        for entry in entries:
+            if not isinstance(entry, dict):
+                continue
+            url = entry.get('url', '')
+            if not isinstance(url, str):
+                continue
+
+            if "api/match/round/" in url:
+                has_round = True
+            if "api/match/leetify_rating/" in url:
+                has_leetify = True
+
+            body = entry.get('body')
+            if "api/data/match/" in url and isinstance(body, dict):
+                _collect_fight_keys(body, key_sets)
+
+        if has_round:
+            round_files.add(str(filepath))
+        if has_leetify:
+            leetify_files.add(str(filepath))
+
+    both = round_files & leetify_files
+
+    print("Structure Analysis Results:")
+    print("-" * 30)
+    print(f"Files with Round API: {len(round_files)}")
+    print(f"Files with Leetify API: {len(leetify_files)}")
+    print(f"Files with BOTH: {len(both)}")
+
+    print(f"Files with ONLY Round: {len(round_files - leetify_files)}")
+    print(f"Files with ONLY Leetify: {len(leetify_files - round_files)}")
+    print(f"Files with BOTH: {len(both)}")
+
+    print("\nFight Structure Analysis:")
+    print("-" * 30)
+    print(f"Fight keys count: {len(key_sets['fight'])}")
+    print(f"Fight_T keys count: {len(key_sets['fight_t'])}")
+    print(f"Fight_CT keys count: {len(key_sets['fight_ct'])}")
+
+    all_keys = set().union(*key_sets.values())
+    missing = {field: all_keys - keys for field, keys in key_sets.items()}
+
+    if not any(missing.values()):
+        print("PERFECT MATCH: fight, fight_t, and fight_ct have identical keys.")
+    else:
+        for field, absent in missing.items():
+            if absent:
+                # Sorted so the report is identical from run to run.
+                print(f"Keys missing in '{field}': {sorted(absent)}")
+
+
+def _load_entries(filepath):
+    """Return the top-level JSON list stored in *filepath*, or ``None``.
+
+    ``None`` is returned when the file cannot be opened, is not valid
+    UTF-8 JSON, or its top-level value is not a list.
+    """
+    try:
+        with open(filepath, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+    except (OSError, ValueError, RecursionError):
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+        return None
+    return data if isinstance(data, list) else None
+
+
+def _collect_fight_keys(body, key_sets):
+    """Add the keys of each player's fight dicts in *body* to *key_sets*.
+
+    *key_sets* maps a player field name (``fight``, ``fight_t``, ...) to the
+    set that accumulates the keys seen under that field. Only lists stored
+    under ``body['data']['group_*']`` are inspected.
+    """
+    main_data = body.get('data', {})
+    if not isinstance(main_data, dict):
+        return
+
+    for group_name, players in main_data.items():
+        if not (group_name.startswith('group_') and isinstance(players, list)):
+            continue
+        for player in players:
+            if not isinstance(player, dict):
+                continue
+            for field, keys in key_sets.items():
+                stats = player.get(field)
+                if isinstance(stats, dict):
+                    keys.update(stats.keys())
+
+# Script entry point: analyze the "output_arena" directory, resolved
+# relative to the current working directory.
+if __name__ == "__main__":
+    analyze_structures("output_arena")
--- a/utils/json_extractor/extractor.py
+++ b/utils/json_extractor/extractor.py
@@ -0,0 +1,243 @@
+import json
+import os
+from pathlib import Path
+from urllib.parse import urlparse
+from collections import defaultdict
+from .rules import is_ignored_url, get_key_mask, get_value_type
+
+class SchemaExtractor:
+    """Build a merged structural schema from captured JSON responses.
+
+    Every response body is folded into one schema tree per URL category.
+    A schema node is a dict with:
+
+    * ``types``      - set of type labels seen at this position,
+    * ``count``      - number of values merged into the node,
+    * ``properties`` - child nodes keyed by masked key (dict values only),
+    * ``items``      - merged node for list elements, or ``None`` when only
+                       empty lists were seen (list values only),
+    * ``examples``   - up to ``MAX_EXAMPLES`` stringified primitive values.
+    """
+
+    # Maximum number of distinct example strings kept per primitive node.
+    MAX_EXAMPLES = 5
+
+    def __init__(self):
+        # schemas: category -> schema_node
+        self.schemas = {}
+        # url_counts: category -> number of responses merged into it
+        self.url_counts = defaultdict(int)
+
+    def get_url_category(self, url):
+        """Derive a category name from the path of *url*.
+
+        Path segments starting with ``g161-`` become ``{match_id}`` and
+        all-digit segments longer than four characters become ``{id}``.
+        An empty path yields ``"root"``.
+        """
+        masked = []
+        for segment in urlparse(url).path.strip('/').split('/'):
+            if segment.startswith('g161-'):
+                masked.append('{match_id}')
+            elif segment.isdigit() and len(segment) > 4:
+                masked.append('{id}')
+            else:
+                masked.append(segment)
+        return "/".join(masked) or "root"
+
+    def process_directory(self, root_dir):
+        """Process every ``iframe_network.json`` found under *root_dir*.
+
+        Files are visited in sorted path order so that the captured
+        examples and the property order do not depend on the order in
+        which the filesystem happens to list directories.
+        """
+        files = sorted(Path(root_dir).rglob("iframe_network.json"))
+        total = len(files)
+        print(f"Found {total} files to process.")
+
+        for index, filepath in enumerate(files):
+            if index % 10 == 0:
+                print(f"Processing {index}/{total}: {filepath}")
+            self.process_file(filepath)
+
+    def process_file(self, filepath):
+        """Merge all usable response bodies from one capture file.
+
+        Unreadable or malformed files are reported and skipped. Entries
+        are ignored when they are not dicts, have no string URL, match an
+        ignored URL pattern, have a status other than 200, or have an
+        empty body.
+        """
+        try:
+            with open(filepath, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+        except (OSError, ValueError, RecursionError) as exc:
+            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+            print(f"Skipping unreadable file {filepath}: {exc}")
+            return
+
+        if not isinstance(data, list):
+            return
+
+        for entry in data:
+            if not isinstance(entry, dict):
+                continue
+
+            url = entry.get('url', '')
+            if not isinstance(url, str) or not url or is_ignored_url(url):
+                continue
+
+            if entry.get('status') != 200:
+                continue
+
+            body = entry.get('body')
+            if not body:
+                continue
+
+            category = self.get_url_category(url)
+            self.url_counts[category] += 1
+            self.schemas[category] = self.merge_value(
+                self.schemas.get(category), body
+            )
+
+    def merge_value(self, schema, value):
+        """Merge *value* into *schema* and return the updated node.
+
+        Args:
+            schema: Existing node for this position, or ``None`` to create
+                a new one. The node is updated in place.
+            value: Decoded JSON value to merge.
+
+        Returns:
+            The (possibly newly created) schema node.
+        """
+        val_type = get_value_type(value)
+
+        if schema is None:
+            schema = {"types": {val_type}, "count": 1}
+        else:
+            schema["count"] += 1
+            schema["types"].add(val_type)
+
+        if isinstance(value, dict):
+            properties = schema.setdefault("properties", {})
+            for key, child in value.items():
+                masked_key = get_key_mask(key)
+                properties[masked_key] = self.merge_value(
+                    properties.get(masked_key), child
+                )
+        elif isinstance(value, list):
+            schema.setdefault("items", None)
+            for item in value:
+                schema["items"] = self.merge_value(schema["items"], item)
+        else:
+            examples = schema.setdefault("examples", set())
+            if len(examples) < self.MAX_EXAMPLES:
+                # Stored as strings so values of mixed types can share a set.
+                examples.add(str(value))
+
+        return schema
+
+    def to_serializable(self, schema):
+        """Return a JSON-serializable copy of *schema* (sets become sorted lists)."""
+        if schema is None:
+            return None
+
+        res = {
+            "types": sorted(schema["types"]),
+            "count": schema["count"],
+        }
+
+        if "properties" in schema:
+            res["properties"] = {
+                key: self.to_serializable(child)
+                for key, child in sorted(schema["properties"].items())
+            }
+
+        if "items" in schema:
+            res["items"] = self.to_serializable(schema["items"])
+
+        if "examples" in schema:
+            res["examples"] = sorted(schema["examples"])
+
+        return res
+
+    def export_report(self, output_path):
+        """Write the full schema of every category to *output_path* as JSON."""
+        report = {
+            category: self.to_serializable(schema)
+            for category, schema in self.schemas.items()
+        }
+
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2, ensure_ascii=False)
+        print(f"Report saved to {output_path}")
+
+    def export_markdown_summary(self, output_path):
+        """Write a Markdown outline of every category's schema to *output_path*."""
+        with open(output_path, 'w', encoding='utf-8') as f:
+            f.write("# Schema Hierarchy Report\n\n")
+
+            for category, schema in sorted(self.schemas.items()):
+                f.write(f"## Category: `{category}`\n")
+                f.write(f"**Total Requests**: {self.url_counts[category]}\n\n")
+
+                self._write_markdown_schema(f, schema, level=0)
+                f.write("\n---\n\n")
+        print(f"Markdown summary saved to {output_path}")
+
+    def export_csv_summary(self, output_path):
+        """Write a flattened (one row per path) CSV summary to *output_path*."""
+        import csv
+        with open(output_path, 'w', encoding='utf-8', newline='') as f:
+            writer = csv.writer(f)
+            writer.writerow(["Category", "Path", "Types", "Examples"])
+
+            for category, schema in sorted(self.schemas.items()):
+                self._write_csv_schema(writer, category, schema, path="")
+        print(f"CSV summary saved to {output_path}")
+
+    def _write_csv_schema(self, writer, category, schema, path):
+        """Emit CSV rows for *schema* and, recursively, its children.
+
+        A row is written for every node that has nothing to descend into
+        (primitives, always-empty dicts, always-empty lists) and for every
+        node that recorded primitive examples, so the primitive side of a
+        mixed-type node is not lost. Examples are sorted to keep the
+        output stable between runs.
+        """
+        if schema is None:
+            return
+
+        type_str = ", ".join(map(str, sorted(schema["types"])))
+        properties = schema.get("properties")
+        item_schema = schema.get("items")
+        has_children = bool(properties) or item_schema is not None
+
+        if not has_children or "examples" in schema:
+            examples = sorted(schema.get("examples", ()))
+            writer.writerow([category, path, type_str, "; ".join(examples[:3])])
+
+        for key, child in (properties or {}).items():
+            child_path = f"{path}.{key}" if path else key
+            self._write_csv_schema(writer, category, child, child_path)
+
+        if item_schema is not None:
+            self._write_csv_schema(writer, category, item_schema, f"{path}[]")
+
+    @staticmethod
+    def _format_node_summary(schema):
+        """Return ``"type, type[, e.g. example]"`` describing *schema*.
+
+        The example is the smallest recorded example string and is only
+        shown when the node was never a dict or a list.
+        """
+        types = schema["types"]
+        summary = ", ".join(sorted(types))
+        examples = sorted(schema.get("examples", ()))
+        if examples and "dict" not in types and "list" not in types:
+            summary += f", e.g. {examples[0]}"
+        return summary
+
+    def _write_markdown_schema(self, f, schema, level=0):
+        """Write the children of *schema* as a nested Markdown bullet list.
+
+        Each property is written as ``- **key** (types, e.g. example)``.
+        List nodes get an ``*[Array Items]*`` bullet that also carries the
+        element types when any element was seen. A node without
+        properties or items writes nothing: its own line is produced by
+        its parent.
+        """
+        if schema is None:
+            return
+
+        indent = "  " * level
+
+        for key, child in schema.get("properties", {}).items():
+            f.write(f"{indent}- **{key}** ({self._format_node_summary(child)})\n")
+            self._write_markdown_schema(f, child, level + 1)
+
+        if "items" in schema:
+            item_schema = schema["items"]
+            if item_schema is None:
+                # Only empty lists were seen, so there is no element type.
+                f.write(f"{indent}- *[Array Items]*\n")
+            else:
+                f.write(
+                    f"{indent}- *[Array Items]* "
+                    f"({self._format_node_summary(item_schema)})\n"
+                )
+            self._write_markdown_schema(f, item_schema, level + 1)
+
--- a/utils/json_extractor/main.py
+++ b/utils/json_extractor/main.py
@@ -0,0 +1,35 @@
+import sys
+import os
+import argparse
+
+# Make the absolute ``utils.json_extractor`` import below resolvable when this
+# file is run directly as a script: the project root is taken to be two
+# directory levels above the directory containing this file, and it is
+# appended to sys.path.
+current_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.dirname(os.path.dirname(current_dir))
+sys.path.append(project_root)
+
+from utils.json_extractor.extractor import SchemaExtractor
+
+def main():
+    """Parse CLI arguments, run the schema extraction and write all reports.
+
+    The parent directory of each of the three output paths is created when
+    missing. An output path without a directory component (for example
+    ``schema.json``) is written to the current working directory.
+    """
+    parser = argparse.ArgumentParser(description="Extract JSON schema from 5E Arena data.")
+    parser.add_argument("--input", default="output_arena", help="Input directory containing iframe_network.json files")
+    parser.add_argument("--output-json", default="output_reports/schema_full.json", help="Output JSON report path")
+    parser.add_argument("--output-md", default="output_reports/schema_summary.md", help="Output Markdown summary path")
+    parser.add_argument("--output-csv", default="output_reports/schema_flat.csv", help="Output CSV flat report path")
+
+    args = parser.parse_args()
+
+    print(f"Starting extraction from {args.input}...")
+    extractor = SchemaExtractor()
+    extractor.process_directory(args.input)
+
+    # Ensure every output directory exists, including the CSV one.
+    for output_path in (args.output_json, args.output_md, args.output_csv):
+        _ensure_parent_dir(output_path)
+
+    extractor.export_report(args.output_json)
+    extractor.export_markdown_summary(args.output_md)
+    extractor.export_csv_summary(args.output_csv)
+    print("Done.")
+
+
+def _ensure_parent_dir(path):
+    """Create the directory that will contain *path*, if it names one."""
+    parent = os.path.dirname(path)
+    # os.makedirs("") raises FileNotFoundError, so skip bare file names.
+    if parent:
+        os.makedirs(parent, exist_ok=True)
+
+# Run the command-line interface only when executed as a script.
+if __name__ == "__main__":
+    main()
--- a/utils/json_extractor/rules.py
+++ b/utils/json_extractor/rules.py
@@ -0,0 +1,81 @@
+import re
+
+# Regex patterns used to recognize dynamic identifiers so they can be masked.
+# Both are anchored at the start and the end of the string.
+STEAMID_REGEX = re.compile(r"^7656\d+$")  # "7656" followed by one or more digits
+FIVE_E_ID_REGEX = re.compile(r"^1\d{7}$")  # 1 followed by 7 digits (8 digits total)
+
+# Keys of the form "group_<digits>"; get_key_mask() collapses them to "group_N".
+GROUP_KEY_REGEX = re.compile(r"^group_\d+$")
+
+# URL exclusion patterns: regular expressions that is_ignored_url() searches
+# for anywhere in the URL.
+# We skip these URLs as they are analytics/auth related and not data payload.
+IGNORE_URL_PATTERNS = [
+    r"sentry_key=",
+    r"gate\.5eplay\.com/blacklistfront",
+    r"favicon\.ico",
+]
+
+# URL inclusion/interest patterns (optional, currently disabled; would be
+# used if we wanted to be strict about which URLs are processed).
+# INTEREST_URL_PATTERNS = [
+#     r"api/data/match",
+#     r"leetify",
+# ]
+
+def is_ignored_url(url):
+    """Return True when *url* contains a match for any IGNORE_URL_PATTERNS regex."""
+    return any(re.search(pattern, url) for pattern in IGNORE_URL_PATTERNS)
+
+def get_key_mask(key):
+    """
+    Map a dictionary key to its generalized name.
+
+    Rules are tried in this order, and the first one that applies wins:
+    ``group_<digits>`` -> ``group_N``, SteamID-like -> ``<steamid>``,
+    5E-ID-like -> ``<5eid>``, ``fight``/``fight_t``/``fight_ct`` ->
+    ``fight_any``, any other all-digit key (likely a round number) ->
+    ``<round_n>``. Keys matching no rule are returned unchanged.
+    """
+    pattern_masks = (
+        (GROUP_KEY_REGEX, "group_N"),
+        (STEAMID_REGEX, "<steamid>"),
+        (FIVE_E_ID_REGEX, "<5eid>"),
+    )
+    for pattern, mask in pattern_masks:
+        if pattern.match(key):
+            return mask
+
+    # The three fight variants are merged into a single schema node.
+    if key in ("fight", "fight_t", "fight_ct"):
+        return "fight_any"
+
+    return "<round_n>" if key.isdigit() else key
+
+def get_value_type(value):
+    """
+    Classify *value* with a generalized type label.
+
+    Returns ``"null"``, ``"bool"``, ``"int"``, ``"float"``, ``"string"``,
+    ``"list"``, ``"dict"`` or ``"unknown"``. Ints and strings whose text
+    matches an ID pattern are labelled ``"<5eid>"`` or ``"<steamid>"``
+    instead (the 5E pattern is checked first).
+    """
+    if value is None:
+        return "null"
+    # bool must be tested before int because bool is a subclass of int.
+    if isinstance(value, bool):
+        return "bool"
+
+    if isinstance(value, (int, str)):
+        is_text = isinstance(value, str)
+        text = value if is_text else str(value)
+        if FIVE_E_ID_REGEX.match(text):
+            return "<5eid>"
+        if STEAMID_REGEX.match(text):
+            return "<steamid>"
+        return "string" if is_text else "int"
+
+    for py_type, label in ((float, "float"), (list, "list"), (dict, "dict")):
+        if isinstance(value, py_type):
+            return label
+    return "unknown"