feat: Add recent performance stability stats (matches/days) to player profile
This commit is contained in:
0
utils/__init__.py
Normal file
0
utils/__init__.py
Normal file
65
utils/json_extractor/README.md
Normal file
65
utils/json_extractor/README.md
Normal file
@@ -0,0 +1,65 @@
|
||||
# JSON Schema Extractor
|
||||
|
||||
用于从大量 5E Arena 比赛数据 (`iframe_network.json`) 中提取、归纳和分析 JSON Schema 的工具。它能够自动处理复杂的嵌套结构,识别动态 Key(如 SteamID、5E ID、Round Number),并生成层级清晰的结构报告。
|
||||
|
||||
## ✨ 核心功能
|
||||
|
||||
* **批量处理**: 自动扫描并处理目录下的所有 `iframe_network.json` 文件。
|
||||
* **智能归并**:
|
||||
* **动态 Key 掩码**: 自动识别并掩盖 SteamID (`<steamid>`)、5E ID (`<5eid>`) 和回合数 (`<round_n>`)。
|
||||
* **结构合并**: 自动将 `group_1`/`group_2` 合并为 `group_N`,将 `fight`/`fight_t`/`fight_ct` 合并为 `fight_any`。
|
||||
* **多格式输出**:
|
||||
* `schema_summary.md`: 易于阅读的 Markdown 层级报告。
|
||||
* `schema_full.json`: 包含类型统计和完整结构的机器可读 JSON。
|
||||
* `schema_flat.csv`: 扁平化的 CSV 字段列表,方便 Excel 查看。
|
||||
* **智能分类**: 根据 URL 路径自动将数据归类(如 Match Data, Leetify Rating, Round Data 等)。
|
||||
|
||||
## 🚀 快速开始
|
||||
|
||||
### 1. 运行提取器
|
||||
|
||||
在项目根目录下运行:
|
||||
|
||||
```bash
|
||||
# 使用默认配置 (输入: output_arena, 输出: output_reports/)
|
||||
python utils/json_extractor/main.py
|
||||
|
||||
# 自定义输入输出
|
||||
python utils/json_extractor/main.py --input my_data_folder --output-md my_report.md
|
||||
```
|
||||
|
||||
### 2. 查看报告
|
||||
|
||||
运行完成后,在 `output_reports/` 目录下查看结果:
|
||||
|
||||
* **[schema_summary.md](../../output_reports/schema_summary.md)**: 推荐首先查看此文件,快速了解数据结构。
|
||||
* **[schema_flat.csv](../../output_reports/schema_flat.csv)**: 需要查找特定字段(如 `adr`)在哪些层级出现时使用。
|
||||
|
||||
## 🛠️ 规则配置
|
||||
|
||||
核心规则定义在 `utils/json_extractor/rules.py` 中,你可以根据需要修改:
|
||||
|
||||
* **ID 识别**: 修改 `STEAMID_REGEX` 或 `FIVE_E_ID_REGEX` 正则。
|
||||
* **URL 过滤**: 修改 `IGNORE_URL_PATTERNS` 列表以忽略无关请求(如 sentry 日志)。
|
||||
* **Key 归并**: 修改 `get_key_mask` 函数来添加新的归并逻辑。
|
||||
|
||||
## 📊 结构分析工具
|
||||
|
||||
如果需要深入分析某些结构(如 `fight` 对象的变体),可以使用分析脚本:
|
||||
|
||||
```bash
|
||||
python utils/json_extractor/analyze_structure.py
|
||||
```
|
||||
|
||||
该脚本会统计特定字段的覆盖率,并检查不同 API(如 Round API 与 Leetify API)的共存情况。
|
||||
|
||||
## 📁 目录结构
|
||||
|
||||
```
|
||||
utils/json_extractor/
|
||||
├── extractor.py # 核心提取逻辑 (SchemaExtractor 类)
|
||||
├── main.py # 命令行入口
|
||||
├── rules.py # 正则与归并规则定义
|
||||
├── analyze_structure.py # 结构差异分析辅助脚本
|
||||
└── README.md # 本说明文件
|
||||
```
|
||||
101
utils/json_extractor/analyze_structure.py
Normal file
101
utils/json_extractor/analyze_structure.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
def analyze_structures(root_dir):
    """Scan all iframe_network.json captures under *root_dir* and print:

    * how many capture files contain Round API vs Leetify API responses,
      and how those two sets overlap;
    * whether the per-player ``fight`` / ``fight_t`` / ``fight_ct`` objects
      observed in ``api/data/match`` payloads share an identical key set.
    """
    files = list(Path(root_dir).rglob("iframe_network.json"))

    fight_keys = set()
    fight_t_keys = set()
    fight_ct_keys = set()

    # Files (as path strings) that contain at least one response of each API.
    round_files = set()
    leetify_files = set()

    for filepath in files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Unreadable or malformed capture file — skip it.
            # (json.JSONDecodeError is a ValueError subclass.)
            continue

        if not isinstance(data, list):
            continue

        for entry in data:
            url = entry.get('url', '')
            body = entry.get('body')

            if "api/match/round/" in url:
                round_files.add(str(filepath))
            if "api/match/leetify_rating/" in url:
                leetify_files.add(str(filepath))

            # Collect fight-variant keys from data/match payloads:
            # body['data'] -> group_N (list) -> player dict -> fight / fight_t / fight_ct
            if "api/data/match/" in url and isinstance(body, dict):
                main_data = body.get('data', {})
                if isinstance(main_data, dict):
                    for k, v in main_data.items():
                        if k.startswith('group_') and isinstance(v, list):
                            for player in v:
                                if not isinstance(player, dict):
                                    continue
                                for field, bucket in (('fight', fight_keys),
                                                      ('fight_t', fight_t_keys),
                                                      ('fight_ct', fight_ct_keys)):
                                    variant = player.get(field)
                                    if isinstance(variant, dict):
                                        bucket.update(variant.keys())

    print("Structure Analysis Results:")
    print("-" * 30)
    print(f"Files with Round API: {len(round_files)}")
    print(f"Files with Leetify API: {len(leetify_files)}")
    print(f"Files with BOTH: {len(round_files & leetify_files)}")

    only_round = round_files - leetify_files
    only_leetify = leetify_files - round_files
    both = round_files & leetify_files

    print(f"Files with ONLY Round: {len(only_round)}")
    print(f"Files with ONLY Leetify: {len(only_leetify)}")
    print(f"Files with BOTH: {len(both)}")

    print("\nFight Structure Analysis:")
    print("-" * 30)
    print(f"Fight keys count: {len(fight_keys)}")
    print(f"Fight_T keys count: {len(fight_t_keys)}")
    print(f"Fight_CT keys count: {len(fight_ct_keys)}")

    all_keys = fight_keys | fight_t_keys | fight_ct_keys

    missing_in_fight = all_keys - fight_keys
    missing_in_t = all_keys - fight_t_keys
    missing_in_ct = all_keys - fight_ct_keys

    if not (missing_in_fight or missing_in_t or missing_in_ct):
        print("PERFECT MATCH: fight, fight_t, and fight_ct have identical keys.")
    else:
        if missing_in_fight:
            print(f"Keys missing in 'fight': {missing_in_fight}")
        if missing_in_t:
            print(f"Keys missing in 'fight_t': {missing_in_t}")
        if missing_in_ct:
            print(f"Keys missing in 'fight_ct': {missing_in_ct}")
|
||||
|
||||
# Script entry point: analyze the default crawl output directory.
# Usage: python utils/json_extractor/analyze_structure.py
if __name__ == "__main__":
    analyze_structures("output_arena")
|
||||
243
utils/json_extractor/extractor.py
Normal file
243
utils/json_extractor/extractor.py
Normal file
@@ -0,0 +1,243 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
from collections import defaultdict
|
||||
from .rules import is_ignored_url, get_key_mask, get_value_type
|
||||
|
||||
class SchemaExtractor:
    """Accumulate a merged JSON schema per URL category from crawled
    ``iframe_network.json`` captures, then export JSON / Markdown / CSV reports.

    Schema nodes are plain dicts: ``{"types": set, "count": int}`` plus
    optionally ``"properties"`` (dict children), ``"items"`` (list elements)
    and ``"examples"`` (primitive samples, capped at 5).
    """

    def __init__(self):
        # schemas: category name -> schema node (see merge_value for node shape)
        self.schemas = {}
        # category name -> number of merged 200-status responses
        self.url_counts = defaultdict(int)

    def get_url_category(self, url):
        """Derive a category name from *url* by masking dynamic path segments.

        Match IDs (``g161-...``) become ``{match_id}`` and long numeric IDs
        become ``{id}``, so all requests to the same endpoint share a category.
        Returns ``"root"`` for an empty path.
        """
        parsed = urlparse(url)
        parts = parsed.path.strip('/').split('/')
        cleaned_parts = []
        for p in parts:
            # Mask Match IDs (e.g., g161-...)
            if p.startswith('g161-'):
                cleaned_parts.append('{match_id}')
            # Mask other long numeric IDs (player ids, timestamps, ...)
            elif p.isdigit() and len(p) > 4:
                cleaned_parts.append('{id}')
            else:
                cleaned_parts.append(p)

        category = "/".join(cleaned_parts)
        if not category:
            category = "root"
        return category

    def process_directory(self, root_dir):
        """Process every iframe_network.json found under *root_dir* (recursive)."""
        p = Path(root_dir)
        files = list(p.rglob("iframe_network.json"))
        print(f"Found {len(files)} files to process.")

        for i, filepath in enumerate(files):
            if i % 10 == 0:
                print(f"Processing {i}/{len(files)}: {filepath}")
            self.process_file(filepath)

    def process_file(self, filepath):
        """Merge all usable (status 200, non-empty body) entries of one capture file."""
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception:
            # Best-effort: silently skip unreadable or malformed capture files.
            return

        if not isinstance(data, list):
            return

        for entry in data:
            url = entry.get('url', '')
            if not url or is_ignored_url(url):
                continue

            if entry.get('status') != 200:
                continue

            body = entry.get('body')
            # Empty bodies carry no schema information.
            if not body:
                continue

            category = self.get_url_category(url)
            self.url_counts[category] += 1

            if category not in self.schemas:
                self.schemas[category] = None

            self.schemas[category] = self.merge_value(self.schemas[category], body)

    def merge_value(self, schema, value):
        """Merge *value* into schema node *schema* (may be None) and return the node."""
        val_type = get_value_type(value)

        if schema is None:
            schema = {
                "types": {val_type},
                "count": 1
            }
        else:
            schema["count"] += 1
            schema["types"].add(val_type)

        # Handle Dicts
        if isinstance(value, dict):
            if "properties" not in schema:
                schema["properties"] = {}

            for k, v in value.items():
                # Dynamic keys (steamids, round numbers, ...) are masked so
                # structurally identical subtrees merge into one node.
                masked_key = get_key_mask(k)
                schema["properties"][masked_key] = self.merge_value(
                    schema["properties"].get(masked_key),
                    v
                )

        # Handle Lists
        elif isinstance(value, list):
            if "items" not in schema:
                schema["items"] = None

            for item in value:
                schema["items"] = self.merge_value(schema["items"], item)

        # Handle Primitives: keep up to 5 sample values for the reports.
        else:
            if "examples" not in schema:
                schema["examples"] = set()
            if len(schema["examples"]) < 5:
                # Store string representation to avoid type issues in set
                schema["examples"].add(str(value))

        return schema

    def to_serializable(self, schema):
        """Convert an internal schema node (with sets) to a JSON-serializable dict."""
        if schema is None:
            return None

        res = {
            "types": list(sorted(schema["types"])),
            "count": schema["count"]
        }

        if "properties" in schema:
            res["properties"] = {
                k: self.to_serializable(v)
                for k, v in sorted(schema["properties"].items())
            }

        if "items" in schema:
            res["items"] = self.to_serializable(schema["items"])

        if "examples" in schema:
            res["examples"] = list(sorted(schema["examples"]))

        return res

    def export_report(self, output_path):
        """Write the full merged schema of every category to *output_path* as JSON."""
        report = {
            category: self.to_serializable(schema)
            for category, schema in self.schemas.items()
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)
        print(f"Report saved to {output_path}")

    def export_markdown_summary(self, output_path):
        """Write a human-readable Markdown hierarchy, one section per category."""
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("# Schema Hierarchy Report\n\n")

            for category, schema in sorted(self.schemas.items()):
                f.write(f"## Category: `{category}`\n")
                f.write(f"**Total Requests**: {self.url_counts[category]}\n\n")

                self._write_markdown_schema(f, schema, level=0)
                f.write("\n---\n\n")
        print(f"Markdown summary saved to {output_path}")

    def export_csv_summary(self, output_path):
        """Write a flattened field list (one row per leaf path) as CSV."""
        import csv
        with open(output_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(["Category", "Path", "Types", "Examples"])

            for category, schema in sorted(self.schemas.items()):
                self._write_csv_schema(writer, category, schema, path="")
        print(f"CSV summary saved to {output_path}")

    def _write_csv_schema(self, writer, category, schema, path):
        """Recursively emit one CSV row per leaf node under *schema*."""
        if schema is None:
            return

        type_str = ", ".join(map(str, sorted(schema["types"])))

        # A leaf has neither nested properties nor list items.
        is_leaf = "properties" not in schema and "items" not in schema

        if is_leaf:
            examples = list(schema.get("examples", []))
            ex_str = "; ".join(examples[:3]) if examples else ""
            writer.writerow([category, path, type_str, ex_str])

        if "properties" in schema:
            for k, v in schema["properties"].items():
                new_path = f"{path}.{k}" if path else k
                self._write_csv_schema(writer, category, v, new_path)

        if "items" in schema:
            self._write_csv_schema(writer, category, schema["items"], f"{path}[]")

    def _write_markdown_schema(self, f, schema, level=0):
        """Recursively write nested bullet lists for *schema* at *level* depth.

        Leaf nodes are rendered by the parent loop as part of the key line,
        so a leaf reached here produces no output of its own.
        """
        if schema is None:
            return

        # Leaf (no props, no items): already printed by the parent's key line.
        # (The original also computed type/example strings here and discarded
        # them — that dead code is removed.)
        if "properties" not in schema and "items" not in schema:
            return

        indent = " " * level

        if "properties" in schema:
            for k, v in schema["properties"].items():
                v_types = ", ".join(list(sorted(v["types"])))
                v_ex = list(v.get("examples", []))
                # Only show an example for primitive-typed values.
                v_ex_str = f", e.g. {v_ex[0]}" if v_ex and "dict" not in v["types"] and "list" not in v["types"] else ""

                f.write(f"{indent}- **{k}** ({v_types}{v_ex_str})\n")
                self._write_markdown_schema(f, v, level + 1)

        if "items" in schema:
            f.write(f"{indent}- *[Array Items]*\n")
            self._write_markdown_schema(f, schema["items"], level + 1)
|
||||
|
||||
35
utils/json_extractor/main.py
Normal file
35
utils/json_extractor/main.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import sys
import os
import argparse

# Add project root to path so we can import utils.json_extractor
# (this file lives in utils/json_extractor/, so the root is two levels up).
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(os.path.dirname(current_dir))
sys.path.append(project_root)

from utils.json_extractor.extractor import SchemaExtractor
|
||||
|
||||
def main():
    """CLI entry point: extract schemas from capture files and write reports.

    Parses ``--input`` / ``--output-json`` / ``--output-md`` / ``--output-csv``,
    runs the extractor over the input directory, and writes all three reports.
    """
    parser = argparse.ArgumentParser(description="Extract JSON schema from 5E Arena data.")
    parser.add_argument("--input", default="output_arena", help="Input directory containing iframe_network.json files")
    parser.add_argument("--output-json", default="output_reports/schema_full.json", help="Output JSON report path")
    parser.add_argument("--output-md", default="output_reports/schema_summary.md", help="Output Markdown summary path")
    parser.add_argument("--output-csv", default="output_reports/schema_flat.csv", help="Output CSV flat report path")

    args = parser.parse_args()

    print(f"Starting extraction from {args.input}...")
    extractor = SchemaExtractor()
    extractor.process_directory(args.input)

    # Ensure every output directory exists — the CSV path was previously
    # missed, so a custom --output-csv into a new directory would crash.
    # A bare filename has no parent directory to create (dirname == "").
    for output_path in (args.output_json, args.output_md, args.output_csv):
        parent = os.path.dirname(output_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    extractor.export_report(args.output_json)
    extractor.export_markdown_summary(args.output_md)
    extractor.export_csv_summary(args.output_csv)
    print("Done.")
|
||||
|
||||
# Script entry point: python utils/json_extractor/main.py [--input ...]
if __name__ == "__main__":
    main()
|
||||
81
utils/json_extractor/rules.py
Normal file
81
utils/json_extractor/rules.py
Normal file
@@ -0,0 +1,81 @@
|
||||
import re
|
||||
|
||||
# Regex patterns for masking sensitive/dynamic data
STEAMID_REGEX = re.compile(r"^7656\d+$")
FIVE_E_ID_REGEX = re.compile(r"^1\d{7}$")  # 1 followed by 7 digits (8 digits total)

# Group merging
GROUP_KEY_REGEX = re.compile(r"^group_\d+$")

# URL Exclusion patterns
# We skip these URLs as they are analytics/auth related and not data payload.
# Edit this list to ignore additional request patterns (see README).
IGNORE_URL_PATTERNS = [
    r"sentry_key=",
    r"gate\.5eplay\.com/blacklistfront",
    r"favicon\.ico",
]

# Pre-compiled forms of IGNORE_URL_PATTERNS, built once at import time so
# is_ignored_url avoids the per-call regex-cache lookup for every pattern.
_IGNORE_URL_RES = [re.compile(p) for p in IGNORE_URL_PATTERNS]

# URL Inclusion/Interest patterns (Optional, if we want to be strict)
# INTEREST_URL_PATTERNS = [
#     r"api/data/match",
#     r"leetify",
# ]

def is_ignored_url(url):
    """Return True if *url* matches any pattern in IGNORE_URL_PATTERNS."""
    return any(p.search(url) for p in _IGNORE_URL_RES)
|
||||
|
||||
def get_key_mask(key):
    """Map a dynamic dict key to its masked/merged form.

    ``group_1``/``group_2``/... -> ``group_N``; SteamIDs -> ``<steamid>``;
    5E IDs -> ``<5eid>``; the per-side fight variants -> ``fight_any``;
    purely numeric keys (round numbers) -> ``<round_n>``. Any other key is
    returned unchanged.
    """
    masked_patterns = (
        (GROUP_KEY_REGEX, "group_N"),
        (STEAMID_REGEX, "<steamid>"),
        (FIVE_E_ID_REGEX, "<5eid>"),
    )
    for pattern, mask in masked_patterns:
        if pattern.match(key):
            return mask

    # Merge fight variants into a single node.
    if key in ("fight", "fight_t", "fight_ct"):
        return "fight_any"

    # Merge numeric keys (likely round numbers).
    return "<round_n>" if key.isdigit() else key
|
||||
|
||||
def get_value_type(value):
    """Return a generalized type label for *value*.

    Integers and strings whose text matches a known ID format are reported
    as ``<steamid>`` / ``<5eid>`` instead of their raw type.
    """
    if value is None:
        return "null"
    # bool must be tested before int: bool is an int subclass.
    if isinstance(value, bool):
        return "bool"
    # ints and strings share the ID-masking logic on their text form.
    if isinstance(value, (int, str)):
        text = value if isinstance(value, str) else str(value)
        if FIVE_E_ID_REGEX.match(text):
            return "<5eid>"
        if STEAMID_REGEX.match(text):
            return "<steamid>"
        return "string" if isinstance(value, str) else "int"
    for type_check, label in ((float, "float"), (list, "list"), (dict, "dict")):
        if isinstance(value, type_check):
            return label
    return "unknown"
|
||||
Reference in New Issue
Block a user