feat: Add recent performance stability stats (matches/days) to player profile

2026-01-28 14:04:32 +08:00
commit 48f1f71d3a
104 changed files with 17572 additions and 0 deletions
--- a/ETL/L1A.py
+++ b/ETL/L1A.py
@@ -0,0 +1,102 @@
+"""
+L1A Data Ingestion Script
+
+This script reads raw JSON files from the 'output_arena' directory and ingests them into the SQLite database.
+It supports incremental updates by default, skipping files that have already been processed.
+
+Usage:
+    python ETL/L1A.py           # Standard incremental run
+    python ETL/L1A.py --force   # Force re-process all files (overwrite existing data)
+"""
+
+import os
+
+import json
+import sqlite3
+import glob
+import argparse  # Added
+
+# Filesystem layout, resolved relative to this script's own location.
+# BASE_DIR is the parent of the directory that contains this file.
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+# Directory scanned for per-match sub-directories holding iframe_network.json.
+OUTPUT_ARENA_DIR = os.path.join(BASE_DIR, 'output_arena')
+# Directory and file path of the SQLite database this script writes to.
+DB_DIR = os.path.join(BASE_DIR, 'database', 'L1A')
+DB_PATH = os.path.join(DB_DIR, 'L1A.sqlite')
+
+def init_db():
+    if not os.path.exists(DB_DIR):
+        os.makedirs(DB_DIR)
+    
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS raw_iframe_network (
+            match_id TEXT PRIMARY KEY,
+            content TEXT,
+            processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+        )
+    ''')
+    conn.commit()
+    return conn
+
+def process_files():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--force', action='store_true', help='Force reprocessing of all files')
+    args = parser.parse_args()
+
+    conn = init_db()
+    cursor = conn.cursor()
+    
+    # Get existing match_ids to skip
+    existing_ids = set()
+    if not args.force:
+        try:
+            cursor.execute("SELECT match_id FROM raw_iframe_network")
+            existing_ids = set(row[0] for row in cursor.fetchall())
+            print(f"Found {len(existing_ids)} existing matches in DB. Incremental mode active.")
+        except Exception as e:
+            print(f"Error checking existing data: {e}")
+
+    # Pattern to match all iframe_network.json files
+    # output_arena/*/iframe_network.json
+    pattern = os.path.join(OUTPUT_ARENA_DIR, '*', 'iframe_network.json')
+    files = glob.glob(pattern)
+    
+    print(f"Found {len(files)} files in directory.")
+    
+    count = 0
+    skipped = 0
+    
+    for file_path in files:
+        try:
+            # Extract match_id from directory name
+            # file_path is like .../output_arena/g161-xxx/iframe_network.json
+            parent_dir = os.path.dirname(file_path)
+            match_id = os.path.basename(parent_dir)
+            
+            if match_id in existing_ids:
+                skipped += 1
+                continue
+
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            
+            # Upsert data
+            cursor.execute('''
+                INSERT OR REPLACE INTO raw_iframe_network (match_id, content)
+                VALUES (?, ?)
+            ''', (match_id, content))
+            
+            count += 1
+            if count % 100 == 0:
+                print(f"Processed {count} files...")
+                conn.commit()
+                
+        except Exception as e:
+            print(f"Error processing {file_path}: {e}")
+            
+    conn.commit()
+    conn.close()
+    print(f"Finished. Processed: {count}, Skipped: {skipped}.")
+
+if __name__ == '__main__':
+    process_files()
--- a/ETL/L2_Builder.py
+++ b/ETL/L2_Builder.py
--- a/ETL/L3_Builder.py
+++ b/ETL/L3_Builder.py
@@ -0,0 +1,108 @@
+
+import logging
+import os
+import sys
+
+# Add parent directory to path to allow importing web module
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from web.services.feature_service import FeatureService
+from web.config import Config
+from web.app import create_app
+import sqlite3
+
+# Setup logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+L3_DB_PATH = Config.DB_L3_PATH
+SCHEMA_PATH = os.path.join(Config.BASE_DIR, 'database', 'L3', 'schema.sql')
+
+def _get_existing_columns(conn, table_name):
+    cur = conn.execute(f"PRAGMA table_info({table_name})")
+    return {row[1] for row in cur.fetchall()}
+
+def _ensure_columns(conn, table_name, columns):
+    existing = _get_existing_columns(conn, table_name)
+    for col, col_type in columns.items():
+        if col in existing:
+            continue
+        conn.execute(f"ALTER TABLE {table_name} ADD COLUMN {col} {col_type}")
+
+def init_db():
+    l3_dir = os.path.dirname(L3_DB_PATH)
+    if not os.path.exists(l3_dir):
+        os.makedirs(l3_dir)
+        
+    conn = sqlite3.connect(L3_DB_PATH)
+    with open(SCHEMA_PATH, 'r', encoding='utf-8') as f:
+        conn.executescript(f.read())
+
+    _ensure_columns(
+        conn,
+        "dm_player_features",
+        {
+            "rd_phase_kill_early_share": "REAL",
+            "rd_phase_kill_mid_share": "REAL",
+            "rd_phase_kill_late_share": "REAL",
+            "rd_phase_death_early_share": "REAL",
+            "rd_phase_death_mid_share": "REAL",
+            "rd_phase_death_late_share": "REAL",
+            "rd_phase_kill_early_share_t": "REAL",
+            "rd_phase_kill_mid_share_t": "REAL",
+            "rd_phase_kill_late_share_t": "REAL",
+            "rd_phase_kill_early_share_ct": "REAL",
+            "rd_phase_kill_mid_share_ct": "REAL",
+            "rd_phase_kill_late_share_ct": "REAL",
+            "rd_phase_death_early_share_t": "REAL",
+            "rd_phase_death_mid_share_t": "REAL",
+            "rd_phase_death_late_share_t": "REAL",
+            "rd_phase_death_early_share_ct": "REAL",
+            "rd_phase_death_mid_share_ct": "REAL",
+            "rd_phase_death_late_share_ct": "REAL",
+            "rd_firstdeath_team_first_death_rounds": "INTEGER",
+            "rd_firstdeath_team_first_death_win_rate": "REAL",
+            "rd_invalid_death_rounds": "INTEGER",
+            "rd_invalid_death_rate": "REAL",
+            "rd_pressure_kpr_ratio": "REAL",
+            "rd_pressure_perf_ratio": "REAL",
+            "rd_pressure_rounds_down3": "INTEGER",
+            "rd_pressure_rounds_normal": "INTEGER",
+            "rd_matchpoint_kpr_ratio": "REAL",
+            "rd_matchpoint_perf_ratio": "REAL",
+            "rd_matchpoint_rounds": "INTEGER",
+            "rd_comeback_kill_share": "REAL",
+            "rd_comeback_rounds": "INTEGER",
+            "rd_trade_response_10s_rate": "REAL",
+            "rd_weapon_top_json": "TEXT",
+            "rd_roundtype_split_json": "TEXT",
+            "map_stability_coef": "REAL",
+            "basic_avg_knife_kill": "REAL",
+            "basic_avg_zeus_kill": "REAL",
+            "basic_zeus_pick_rate": "REAL",
+        },
+    )
+
+    conn.commit()
+    conn.close()
+    logger.info("L3 DB Initialized/Updated with Schema.")
+
+def main():
+    logger.info("Starting L3 Builder (Delegating to FeatureService)...")
+    
+    # 1. Ensure Schema is up to date
+    init_db()
+    
+    # 2. Rebuild Features using the centralized logic
+    try:
+        app = create_app()
+        with app.app_context():
+            count = FeatureService.rebuild_all_features()
+            logger.info(f"Successfully rebuilt features for {count} players.")
+    except Exception as e:
+        logger.error(f"Error rebuilding features: {e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    main()
--- a/ETL/README.md
+++ b/ETL/README.md
@@ -0,0 +1,23 @@
+# ETL Pipeline Documentation
+
+## 1. L1A (Raw Data Ingestion)
+**Status**: ✅ Supports Incremental Update
+
+This script ingests raw JSON files from `output_arena/` into `database/L1A/L1A.sqlite`.
+
+### Usage
+```bash
+# Standard Run (Incremental)
+# Only processes new files that are not yet in the database.
+python ETL/L1A.py
+
+# Force Refresh
+# Reprocesses ALL files, overwriting existing records.
+python ETL/L1A.py --force
+```
+
+L1B demoparser2 -> L1B.sqlite
+
+L2 L1A.sqlite (+L1B.sqlite) -> L2.sqlite
+
+L3 Deep Dive
--- a/ETL/refresh.py
+++ b/ETL/refresh.py
@@ -0,0 +1,48 @@
+import os
+import sys
+import subprocess
+import time
+
+def run_script(script_path, args=None):
+    cmd = [sys.executable, script_path]
+    if args:
+        cmd.extend(args)
+    
+    print(f"\n[REFRESH] Running: {' '.join(cmd)}")
+    start_time = time.time()
+    
+    result = subprocess.run(cmd)
+    
+    elapsed = time.time() - start_time
+    if result.returncode != 0:
+        print(f"[REFRESH] Error running {script_path}. Exit code: {result.returncode}")
+        sys.exit(result.returncode)
+    else:
+        print(f"[REFRESH] Finished {script_path} in {elapsed:.2f}s")
+
+def main():
+    base_dir = os.path.dirname(os.path.abspath(__file__))
+    project_root = os.path.dirname(base_dir)
+    
+    print("="*50)
+    print("STARTING FULL DATABASE REFRESH")
+    print("="*50)
+    
+    # 1. L1A --force (Re-ingest all raw data)
+    l1a_script = os.path.join(base_dir, 'L1A.py')
+    run_script(l1a_script, ['--force'])
+    
+    # 2. L2 Builder (Rebuild Fact Tables with fixed K/D logic)
+    l2_script = os.path.join(base_dir, 'L2_Builder.py')
+    run_script(l2_script)
+    
+    # 3. L3 Builder (Rebuild Feature Store)
+    l3_script = os.path.join(base_dir, 'L3_Builder.py')
+    run_script(l3_script)
+    
+    print("="*50)
+    print("DATABASE REFRESH COMPLETED SUCCESSFULLY")
+    print("="*50)
+
+if __name__ == "__main__":
+    main()
--- a/ETL/verify/L1A_incre_test/clean_dirty_data.py
+++ b/ETL/verify/L1A_incre_test/clean_dirty_data.py
@@ -0,0 +1,39 @@
+import sqlite3
+import os
+
+# 路径指向正式数据库
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+DB_PATH = os.path.join(BASE_DIR, 'database', 'L1A', 'L1A.sqlite')
+
+def clean_db():
+    if not os.path.exists(DB_PATH):
+        print(f"Database not found at {DB_PATH}")
+        return
+
+    print(f"Connecting to production DB: {DB_PATH}")
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+
+    # 查找脏数据 (假设模拟数据的 match_id 是 match_001, match_002, match_003)
+    dirty_ids = ['match_001', 'match_002', 'match_003']
+    
+    # 也可以用 LIKE 'match_%' 如果您想删得更彻底，但要小心误删
+    # 这里我们精准删除
+    
+    deleted_count = 0
+    for mid in dirty_ids:
+        cursor.execute("DELETE FROM raw_iframe_network WHERE match_id = ?", (mid,))
+        if cursor.rowcount > 0:
+            print(f"Deleted dirty record: {mid}")
+            deleted_count += 1
+    
+    conn.commit()
+    conn.close()
+    
+    if deleted_count > 0:
+        print(f"Cleanup complete. Removed {deleted_count} dirty records.")
+    else:
+        print("Cleanup complete. No dirty records found.")
+
+if __name__ == "__main__":
+    clean_db()
--- a/ETL/verify/L1A_incre_test/setup_test_data.py
+++ b/ETL/verify/L1A_incre_test/setup_test_data.py
@@ -0,0 +1,35 @@
+import os
+import json
+
+# 定义路径
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+PROJECT_ROOT = os.path.dirname(os.path.dirname(CURRENT_DIR))
+OUTPUT_ARENA_DIR = os.path.join(PROJECT_ROOT, 'output_arena')
+
+def create_mock_data():
+    if not os.path.exists(OUTPUT_ARENA_DIR):
+        os.makedirs(OUTPUT_ARENA_DIR)
+        print(f"Created directory: {OUTPUT_ARENA_DIR}")
+
+    # 创建 3 个模拟比赛数据
+    mock_matches = ['match_001', 'match_002', 'match_003']
+    
+    for match_id in mock_matches:
+        match_dir = os.path.join(OUTPUT_ARENA_DIR, match_id)
+        if not os.path.exists(match_dir):
+            os.makedirs(match_dir)
+        
+        file_path = os.path.join(match_dir, 'iframe_network.json')
+        if not os.path.exists(file_path):
+            mock_content = {
+                "match_id": match_id,
+                "data": "This is mock data for testing."
+            }
+            with open(file_path, 'w', encoding='utf-8') as f:
+                json.dump(mock_content, f)
+            print(f"Created mock file: {file_path}")
+        else:
+            print(f"File already exists: {file_path}")
+
+if __name__ == "__main__":
+    create_mock_data()
--- a/ETL/verify/L1A_incre_test/test_L1_incremental.py
+++ b/ETL/verify/L1A_incre_test/test_L1_incremental.py
@@ -0,0 +1,76 @@
+import os
+import sqlite3
+import subprocess
+import glob
+
+# 配置路径
+# 当前脚本位于 ETL/verify/ 目录下，需要向上两级找到项目根目录
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+PROJECT_ROOT = os.path.dirname(os.path.dirname(CURRENT_DIR))
+
+L1_SCRIPT = os.path.join(PROJECT_ROOT, 'ETL', 'L1A.py')
+DB_PATH = os.path.join(PROJECT_ROOT, 'database', 'L1A', 'L1A.sqlite')
+OUTPUT_ARENA_DIR = os.path.join(PROJECT_ROOT, 'output_arena')
+
+def get_db_count():
+    """获取数据库中的记录数"""
+    if not os.path.exists(DB_PATH):
+        return 0
+    try:
+        conn = sqlite3.connect(DB_PATH)
+        cursor = conn.cursor()
+        cursor.execute("SELECT COUNT(*) FROM raw_iframe_network")
+        count = cursor.fetchone()[0]
+        conn.close()
+        return count
+    except Exception:
+        return 0
+
+def get_file_count():
+    """获取源文件总数"""
+    pattern = os.path.join(OUTPUT_ARENA_DIR, '*', 'iframe_network.json')
+    files = glob.glob(pattern)
+    return len(files)
+
+def run_l1_script():
+    """运行 L1 脚本并返回输出"""
+    # 必须在项目根目录下运行，或者正确处理 Python 路径
+    # 这里我们使用绝对路径调用脚本
+    result = subprocess.run(['python', L1_SCRIPT], capture_output=True, text=True)
+    return result.stdout
+
+def main():
+    print("=== 开始 L1 增量逻辑测试 ===")
+    print(f"项目根目录: {PROJECT_ROOT}")
+    
+    # 1. 检查环境
+    total_files = get_file_count()
+    initial_db_count = get_db_count()
+    print(f"[环境] 源文件总数: {total_files}")
+    print(f"[环境] 数据库当前记录数: {initial_db_count}")
+    
+    # 2. 运行脚本 (第一次)
+    print("\n--- 运行 L1A.py (Run 1) ---")
+    output1 = run_l1_script()
+    print(output1.strip())
+    
+    mid_db_count = get_db_count()
+    print(f"[状态] 运行后数据库记录数: {mid_db_count}")
+    
+    if mid_db_count < total_files:
+        print("警告: 数据库记录数少于文件数，可能部分文件处理失败或尚未完成。")
+    
+    # 3. 运行脚本 (第二次 - 验证增量)
+    print("\n--- 再次运行 L1A.py (Run 2 - 验证增量) ---")
+    output2 = run_l1_script()
+    print(output2.strip())
+    
+    # 4. 验证结果
+    expected_msg = f"Skipped: {total_files}"
+    if expected_msg in output2:
+        print("\n✅ 测试通过! 第二次运行跳过了所有文件，增量逻辑生效。")
+    else:
+        print(f"\n❌ 测试未通过。预期输出应包含 '{expected_msg}'")
+
+if __name__ == "__main__":
+    main()
--- a/ETL/verify/L2_verify_report.txt
+++ b/ETL/verify/L2_verify_report.txt
--- a/ETL/verify/verify_L2.py
+++ b/ETL/verify/verify_L2.py
@@ -0,0 +1,504 @@
+import sqlite3
+import pandas as pd
+import csv
+import os
+import sys
+import time
+
+# Show every column and use a wide console so printed DataFrames are not truncated.
+pd.set_option('display.max_columns', None)
+pd.set_option('display.width', 1000)
+
+# Relative paths: they resolve against the current working directory.
+# db_path is the SQLite database opened by verify().
+db_path = 'database/L2/L2_Main.sqlite'
+# schema_path is the CSV read by load_schema_paths().
+schema_path = 'database/original_json_schema/schema_flat.csv'
+
+# Keys under "data.main." that is_covered() treats as covered
+# (matched against the first path segment after "data.main.").
+covered_main_fields = {
+    "match_code", "map", "start_time", "end_time", "match_winner",
+    "group1_all_score", "group1_change_elo", "group1_fh_role", "group1_fh_score",
+    "group1_origin_elo", "group1_sh_role", "group1_sh_score", "group1_tid", "group1_uids",
+    "group2_all_score", "group2_change_elo", "group2_fh_role", "group2_fh_score",
+    "group2_origin_elo", "group2_sh_role", "group2_sh_score", "group2_tid", "group2_uids",
+    "server_ip", "server_port", "location", "location_full", "map_desc",
+    "demo_url", "game_mode", "game_name", "match_mode", "match_status", "match_flag",
+    "status", "waiver", "year", "season", "round_total", "cs_type", "priority_show_type",
+    "pug10m_show_type", "credit_match_status", "knife_winner", "knife_winner_role",
+    "most_1v2_uid", "most_assist_uid", "most_awp_uid", "most_end_uid",
+    "most_first_kill_uid", "most_headshot_uid", "most_jump_uid", "mvp_uid", "id"
+}
+# Substrings; is_covered() treats a path containing any of them as covered.
+covered_user_fields = {
+    "data.group_N[].user_info."
+}
+# Substrings under data.round_list[] checked by is_covered().
+# NOTE(review): is_covered() already returns True for any path containing
+# "data.round_list" before it consults this list.
+covered_round_fields = [
+    "data.round_list[].current_score.ct",
+    "data.round_list[].current_score.t",
+    "data.round_list[].current_score.final_round_time",
+    "data.round_list[].all_kill[].pasttime",
+    "data.round_list[].all_kill[].weapon",
+    "data.round_list[].all_kill[].headshot",
+    "data.round_list[].all_kill[].penetrated",
+    "data.round_list[].all_kill[].attackerblind",
+    "data.round_list[].all_kill[].throughsmoke",
+    "data.round_list[].all_kill[].noscope",
+    "data.round_list[].all_kill[].attacker.steamid_64",
+    "data.round_list[].all_kill[].victim.steamid_64",
+    "data.round_list[].all_kill[].attacker.pos.x",
+    "data.round_list[].all_kill[].attacker.pos.y",
+    "data.round_list[].all_kill[].attacker.pos.z",
+    "data.round_list[].all_kill[].victim.pos.x",
+    "data.round_list[].all_kill[].victim.pos.y",
+    "data.round_list[].all_kill[].victim.pos.z"
+]
+# Substrings under data.leetify_data.round_stat[] checked by is_covered().
+# NOTE(review): is_covered() already returns True for any path containing
+# "data.leetify_data." before it consults this list.
+covered_leetify_fields = [
+    "data.leetify_data.round_stat[].round",
+    "data.leetify_data.round_stat[].win_reason",
+    "data.leetify_data.round_stat[].end_ts",
+    "data.leetify_data.round_stat[].sfui_event.score_ct",
+    "data.leetify_data.round_stat[].sfui_event.score_t",
+    "data.leetify_data.round_stat[].ct_money_group",
+    "data.leetify_data.round_stat[].t_money_group",
+    "data.leetify_data.round_stat[].show_event[].ts",
+    "data.leetify_data.round_stat[].show_event[].kill_event.Ts",
+    "data.leetify_data.round_stat[].show_event[].kill_event.Killer",
+    "data.leetify_data.round_stat[].show_event[].kill_event.Victim",
+    "data.leetify_data.round_stat[].show_event[].kill_event.WeaponName",
+    "data.leetify_data.round_stat[].show_event[].kill_event.Headshot",
+    "data.leetify_data.round_stat[].show_event[].kill_event.Penetrated",
+    "data.leetify_data.round_stat[].show_event[].kill_event.AttackerBlind",
+    "data.leetify_data.round_stat[].show_event[].kill_event.ThroughSmoke",
+    "data.leetify_data.round_stat[].show_event[].kill_event.NoScope",
+    "data.leetify_data.round_stat[].show_event[].trade_score_change.",
+    "data.leetify_data.round_stat[].show_event[].flash_assist_killer_score_change.",
+    "data.leetify_data.round_stat[].show_event[].killer_score_change.",
+    "data.leetify_data.round_stat[].show_event[].victim_score_change.",
+    "data.leetify_data.round_stat[].bron_equipment.",
+    "data.leetify_data.round_stat[].player_t_score.",
+    "data.leetify_data.round_stat[].player_ct_score.",
+    "data.leetify_data.round_stat[].player_bron_crash."
+]
+# Keys under "data.<steamid>." that is_covered() treats as covered
+# (matched against the first path segment after "data.<steamid>.").
+covered_vip_fields = {
+    "awp_kill",
+    "awp_kill_ct",
+    "awp_kill_t",
+    "damage_receive",
+    "damage_stats",
+    "fd_ct",
+    "fd_t",
+    "kast"
+}
+
+def load_schema_paths(schema_path_value):
+    paths = []
+    with open(schema_path_value, 'r', encoding='utf-8') as f:
+        reader = csv.reader(f)
+        _ = next(reader, None)
+        for row in reader:
+            if len(row) >= 2:
+                paths.append(row[1])
+    return paths
+
+def is_covered(path):
+    if path in ["data", "code", "message", "status", "timestamp", "timeStamp", "traceId", "success", "errcode"]:
+        return True
+    if path.startswith("data.<steamid>."):
+        key = path.split("data.<steamid>.")[1].split(".")[0]
+        if key in covered_vip_fields:
+            return True
+    if "data.group_N[].fight_any." in path:
+        return True
+    if "data.group_N[].fight_t." in path or "data.group_N[].fight_ct." in path:
+        return True
+    if "data.group_N[].sts." in path:
+        return True
+    if "data.group_N[].level_info." in path:
+        return True
+    if "data.treat_info." in path:
+        return True
+    if "data.has_side_data_and_rating2" in path:
+        return True
+    if "data.main." in path:
+        key = path.split("data.main.")[1].split(".")[0]
+        if key in covered_main_fields:
+            return True
+    if any(k in path for k in covered_user_fields):
+        return True
+    if "data.round_list" in path:
+        return True
+    if any(k in path for k in covered_round_fields):
+        return True
+    if "data.leetify_data." in path:
+        return True
+    if any(k in path for k in covered_leetify_fields):
+        return True
+    return False
+
+def group_key(p):
+    if "data.group_N[].user_info." in p:
+        return "data.group_N[].user_info.*"
+    if "data.group_N[].fight_any." in p:
+        return "data.group_N[].fight_any.*"
+    if "data.group_N[].fight_t." in p:
+        return "data.group_N[].fight_t.*"
+    if "data.group_N[].fight_ct." in p:
+        return "data.group_N[].fight_ct.*"
+    if "data.main." in p:
+        return "data.main.*"
+    if "data.round_list[]" in p or "data.round_list[]." in p:
+        return "data.round_list.*"
+    if "data.leetify_data.round_stat[]" in p or "data.leetify_data.round_stat[]." in p:
+        return "data.leetify_data.round_stat.*"
+    if "data.leetify_data." in p:
+        return "data.leetify_data.*"
+    if "data.treat_info." in p:
+        return "data.treat_info.*"
+    if "data." in p:
+        return "data.*"
+    return "other"
+
+def dump_uncovered(output_path):
+    paths = load_schema_paths(schema_path)
+    uncovered = [p for p in paths if not is_covered(p)]
+    df_unc = pd.DataFrame({"path": uncovered})
+    if len(df_unc) == 0:
+        print("no uncovered paths")
+        return
+    df_unc["group"] = df_unc["path"].apply(group_key)
+    df_unc = df_unc.sort_values(["group", "path"])
+    df_unc.to_csv(output_path, index=False, encoding='utf-8-sig')
+    print(f"uncovered total: {len(df_unc)}")
+    print("\n-- uncovered groups (count) --")
+    print(df_unc.groupby("group").size().sort_values(ascending=False))
+    print(f"\noutput: {output_path}")
+
+def print_schema(conn):
+    tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name").fetchall()
+    for (name,) in tables:
+        print(f"\n[{name}]")
+        cols = conn.execute(f"PRAGMA table_info({name})").fetchall()
+        rows = [["column", "type", "pk"]]
+        for _, col_name, col_type, _, _, pk in cols:
+            rows.append([col_name, col_type or "", str(pk)])
+        widths = [max(len(r[i]) for r in rows) for i in range(3)]
+        for idx, r in enumerate(rows):
+            line = " | ".join([r[i].ljust(widths[i]) for i in range(3)])
+            print(line)
+            if idx == 0:
+                print("-" * len(line))
+
+def refresh_schema_sql(conn, output_path):
+    rows = conn.execute("""
+        SELECT type, name, sql
+        FROM sqlite_master
+        WHERE sql IS NOT NULL AND type IN ('table', 'index') AND name NOT LIKE 'sqlite_%'
+        ORDER BY CASE WHEN type='table' THEN 0 ELSE 1 END, name
+    """).fetchall()
+    lines = ["PRAGMA foreign_keys = ON;", ""]
+    for _, _, sql in rows:
+        lines.append(sql.strip() + ";")
+        lines.append("")
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write("\n".join(lines).strip() + "\n")
+
+def verify():
+    conn = sqlite3.connect(db_path)
+    
+    print("--- Counts ---")
+    tables = [
+        'dim_players',
+        'dim_maps',
+        'fact_matches',
+        'fact_match_teams',
+        'fact_match_players',
+        'fact_match_players_t',
+        'fact_match_players_ct',
+        'fact_rounds',
+        'fact_round_events',
+        'fact_round_player_economy'
+    ]
+    for t in tables:
+        count = conn.execute(f"SELECT COUNT(*) FROM {t}").fetchone()[0]
+        print(f"{t}: {count}")
+
+    print("\n--- Data Source Distribution ---")
+    dist = pd.read_sql("SELECT data_source_type, COUNT(*) as cnt FROM fact_matches GROUP BY data_source_type", conn)
+    print(dist)
+    
+    print("\n--- Sample Round Events (Leetify vs Classic) ---")
+    # Fetch one event from a leetify match
+    leetify_match = conn.execute("SELECT match_id FROM fact_matches WHERE data_source_type='leetify' LIMIT 1").fetchone()
+    if leetify_match:
+        mid = leetify_match[0]
+        print(f"Leetify Match: {mid}")
+        df = pd.read_sql(f"SELECT * FROM fact_round_events WHERE match_id='{mid}' AND event_type='kill' LIMIT 1", conn)
+        print(df[['event_type', 'attacker_steam_id', 'trade_killer_steam_id', 'attacker_pos_x', 'score_change_attacker']])
+    
+    # Fetch one event from a classic match
+    classic_match = conn.execute("SELECT match_id FROM fact_matches WHERE data_source_type='classic' LIMIT 1").fetchone()
+    if classic_match:
+        mid = classic_match[0]
+        print(f"Classic Match: {mid}")
+        df = pd.read_sql(f"SELECT * FROM fact_round_events WHERE match_id='{mid}' AND event_type='kill' LIMIT 1", conn)
+        print(df[['event_type', 'attacker_steam_id', 'trade_killer_steam_id', 'attacker_pos_x', 'score_change_attacker']])
+
+    print("\n--- Sample Player Stats (New Fields) ---")
+    df_players = pd.read_sql("SELECT steam_id_64, rating, rating3, elo_change, rank_score, flash_duration, jump_count FROM fact_match_players LIMIT 5", conn)
+    print(df_players)
+    
+    print("\n--- Insert Field Checks ---")
+    meta_counts = conn.execute("""
+        SELECT 
+            SUM(CASE WHEN response_code IS NOT NULL THEN 1 ELSE 0 END) AS response_code_cnt,
+            SUM(CASE WHEN response_trace_id IS NOT NULL AND response_trace_id != '' THEN 1 ELSE 0 END) AS response_trace_id_cnt,
+            SUM(CASE WHEN response_success IS NOT NULL THEN 1 ELSE 0 END) AS response_success_cnt,
+            SUM(CASE WHEN response_errcode IS NOT NULL THEN 1 ELSE 0 END) AS response_errcode_cnt,
+            SUM(CASE WHEN treat_info_raw IS NOT NULL AND treat_info_raw != '' THEN 1 ELSE 0 END) AS treat_info_raw_cnt,
+            SUM(CASE WHEN round_list_raw IS NOT NULL AND round_list_raw != '' THEN 1 ELSE 0 END) AS round_list_raw_cnt,
+            SUM(CASE WHEN leetify_data_raw IS NOT NULL AND leetify_data_raw != '' THEN 1 ELSE 0 END) AS leetify_data_raw_cnt
+        FROM fact_matches
+    """).fetchone()
+    print(f"response_code non-null: {meta_counts[0]}")
+    print(f"response_trace_id non-empty: {meta_counts[1]}")
+    print(f"response_success non-null: {meta_counts[2]}")
+    print(f"response_errcode non-null: {meta_counts[3]}")
+    print(f"treat_info_raw non-empty: {meta_counts[4]}")
+    print(f"round_list_raw non-empty: {meta_counts[5]}")
+    print(f"leetify_data_raw non-empty: {meta_counts[6]}")
+
+    print("\n--- Integrity Checks ---")
+    missing_players = conn.execute("""
+        SELECT COUNT(*) FROM fact_match_players f
+        LEFT JOIN dim_players d ON f.steam_id_64 = d.steam_id_64
+        WHERE d.steam_id_64 IS NULL
+    """).fetchone()[0]
+    print(f"fact_match_players missing dim_players: {missing_players}")
+
+    missing_round_matches = conn.execute("""
+        SELECT COUNT(*) FROM fact_rounds r
+        LEFT JOIN fact_matches m ON r.match_id = m.match_id
+        WHERE m.match_id IS NULL
+    """).fetchone()[0]
+    print(f"fact_rounds missing fact_matches: {missing_round_matches}")
+
+    missing_event_rounds = conn.execute("""
+        SELECT COUNT(*) FROM fact_round_events e
+        LEFT JOIN fact_rounds r ON e.match_id = r.match_id AND e.round_num = r.round_num
+        WHERE r.match_id IS NULL
+    """).fetchone()[0]
+    print(f"fact_round_events missing fact_rounds: {missing_event_rounds}")
+
+    side_zero_t = conn.execute("""
+        SELECT COUNT(*) FROM fact_match_players_t
+        WHERE COALESCE(kills,0)=0 AND COALESCE(deaths,0)=0 AND COALESCE(assists,0)=0
+    """).fetchone()[0]
+    side_zero_ct = conn.execute("""
+        SELECT COUNT(*) FROM fact_match_players_ct
+        WHERE COALESCE(kills,0)=0 AND COALESCE(deaths,0)=0 AND COALESCE(assists,0)=0
+    """).fetchone()[0]
+    print(f"fact_match_players_t zero K/D/A: {side_zero_t}")
+    print(f"fact_match_players_ct zero K/D/A: {side_zero_ct}")
+
+    print("\n--- Full vs T/CT Comparison ---")
+    cols = [
+        'kills', 'deaths', 'assists', 'headshot_count', 'adr', 'rating', 'rating2',
+        'rating3', 'rws', 'mvp_count', 'flash_duration', 'jump_count', 'is_win'
+    ]
+    df_full = pd.read_sql(
+        "SELECT match_id, steam_id_64, " + ",".join(cols) + " FROM fact_match_players",
+        conn
+    )
+    df_t = pd.read_sql(
+        "SELECT match_id, steam_id_64, " + ",".join(cols) + " FROM fact_match_players_t",
+        conn
+    ).rename(columns={c: f"{c}_t" for c in cols})
+    df_ct = pd.read_sql(
+        "SELECT match_id, steam_id_64, " + ",".join(cols) + " FROM fact_match_players_ct",
+        conn
+    ).rename(columns={c: f"{c}_ct" for c in cols})
+
+    df = df_full.merge(df_t, on=['match_id', 'steam_id_64'], how='left')
+    df = df.merge(df_ct, on=['match_id', 'steam_id_64'], how='left')
+
+    def is_empty(s):
+        return s.isna() | (s == 0)
+
+    for c in cols:
+        empty_count = is_empty(df[c]).sum()
+        print(f"{c} empty: {empty_count}")
+
+    additive = ['kills', 'deaths', 'assists', 'headshot_count', 'mvp_count', 'flash_duration', 'jump_count']
+    for c in additive:
+        t_sum = df[f"{c}_t"].fillna(0) + df[f"{c}_ct"].fillna(0)
+        tol = 0.01 if c == 'flash_duration' else 0
+        diff = (df[c].fillna(0) - t_sum).abs() > tol
+        print(f"{c} full != t+ct: {diff.sum()}")
+
+    non_additive = ['adr', 'rating', 'rating2', 'rating3', 'rws', 'is_win']
+    for c in non_additive:
+        side_nonempty = (~is_empty(df[f"{c}_t"])) | (~is_empty(df[f"{c}_ct"]))
+        full_empty_side_nonempty = is_empty(df[c]) & side_nonempty
+        full_nonempty_side_empty = (~is_empty(df[c])) & (~side_nonempty)
+        print(f"{c} full empty but side has: {full_empty_side_nonempty.sum()}")
+        print(f"{c} full has but side empty: {full_nonempty_side_empty.sum()}")
+
+    print("\n--- Rating Detail ---")
+    rating_cols = ['rating', 'rating2', 'rating3']
+    for c in rating_cols:
+        full_null = df[c].isna().sum()
+        full_zero = (df[c] == 0).sum()
+        full_nonzero = ((~df[c].isna()) & (df[c] != 0)).sum()
+        side_t_nonzero = ((~df[f"{c}_t"].isna()) & (df[f"{c}_t"] != 0)).sum()
+        side_ct_nonzero = ((~df[f"{c}_ct"].isna()) & (df[f"{c}_ct"] != 0)).sum()
+        side_any_nonzero = ((~df[f"{c}_t"].isna()) & (df[f"{c}_t"] != 0)) | ((~df[f"{c}_ct"].isna()) & (df[f"{c}_ct"] != 0))
+        full_nonzero_side_zero = ((~df[c].isna()) & (df[c] != 0) & (~side_any_nonzero)).sum()
+        full_zero_side_nonzero = (((df[c].isna()) | (df[c] == 0)) & side_any_nonzero).sum()
+        print(f"{c} full null: {full_null} full zero: {full_zero} full nonzero: {full_nonzero}")
+        print(f"{c} side t nonzero: {side_t_nonzero} side ct nonzero: {side_ct_nonzero}")
+        print(f"{c} full nonzero but side all zero: {full_nonzero_side_zero}")
+        print(f"{c} full zero but side has: {full_zero_side_nonzero}")
+
+    df_rating_src = pd.read_sql(
+        "SELECT f.rating, f.rating2, f.rating3, m.data_source_type FROM fact_match_players f JOIN fact_matches m ON f.match_id = m.match_id",
+        conn
+    )
+    for c in rating_cols:
+        grp = df_rating_src.groupby('data_source_type')[c].apply(lambda s: (s != 0).sum()).reset_index(name='nonzero')
+        print(f"{c} nonzero by source")
+        print(grp)
+
+    print("\n--- Schema Coverage (fight_any) ---")
+    paths = load_schema_paths(schema_path)
+    fight_keys = set()
+    for p in paths:
+        if 'data.group_N[].fight_any.' in p:
+            key = p.split('fight_any.')[1].split('.')[0]
+            fight_keys.add(key)
+    l2_cols = set(pd.read_sql("PRAGMA table_info(fact_match_players)", conn)['name'].tolist())
+    alias = {
+        'kills': 'kill',
+        'deaths': 'death',
+        'assists': 'assist',
+        'headshot_count': 'headshot',
+        'mvp_count': 'is_mvp',
+        'flash_duration': 'flash_enemy_time',
+        'jump_count': 'jump_total',
+        'awp_kills': 'awp_kill'
+    }
+    covered = set()
+    for c in l2_cols:
+        if c in fight_keys:
+            covered.add(c)
+        elif c in alias and alias[c] in fight_keys:
+            covered.add(alias[c])
+    missing_keys = sorted(list(fight_keys - covered))
+    print(f"fight_any keys: {len(fight_keys)}")
+    print(f"covered by L2 columns: {len(covered)}")
+    print(f"uncovered fight_any keys: {len(missing_keys)}")
+    if missing_keys:
+        print(missing_keys)
+
+    print("\n--- Coverage Zero Rate (fight_any -> fact_match_players) ---")
+    fight_cols = [k for k in fight_keys if k in l2_cols or k in alias.values()]
+    col_map = {}
+    for k in fight_cols:
+        if k in l2_cols:
+            col_map[k] = k
+        else:
+            for l2k, src in alias.items():
+                if src == k:
+                    col_map[k] = l2k
+                    break
+    select_cols = ["steam_id_64"] + list(set(col_map.values()))
+    df_fight = pd.read_sql(
+        "SELECT " + ",".join(select_cols) + " FROM fact_match_players",
+        conn
+    )
+    total_rows = len(df_fight)
+    stats = []
+    for fight_key, col in sorted(col_map.items()):
+        s = df_fight[col]
+        zeros = (s == 0).sum()
+        nulls = s.isna().sum()
+        nonzero = total_rows - zeros - nulls
+        stats.append({
+            "fight_key": fight_key,
+            "column": col,
+            "nonzero": nonzero,
+            "zero": zeros,
+            "null": nulls,
+            "zero_rate": 0 if total_rows == 0 else round(zeros / total_rows, 4)
+        })
+    df_stats = pd.DataFrame(stats).sort_values(["zero_rate", "nonzero"], ascending=[False, True])
+    print(df_stats.head(30))
+    print("\n-- zero_rate top (most zeros) --")
+    print(df_stats.head(10))
+    print("\n-- zero_rate bottom (most nonzero) --")
+    print(df_stats.tail(10))
+
+    print("\n--- Schema Coverage (leetify economy) ---")
+    econ_keys = [
+        'data.leetify_data.round_stat[].bron_equipment.',
+        'data.leetify_data.round_stat[].player_t_score.',
+        'data.leetify_data.round_stat[].player_ct_score.',
+        'data.leetify_data.round_stat[].player_bron_crash.'
+    ]
+    for k in econ_keys:
+        count = sum(1 for p in paths if k in p)
+        print(f"{k} paths: {count}")
+
+    print("\n--- Schema Summary Coverage (by path groups) ---")
+    uncovered = [p for p in paths if not is_covered(p)]
+    print(f"total paths: {len(paths)}")
+    print(f"covered paths: {len(paths) - len(uncovered)}")
+    print(f"uncovered paths: {len(uncovered)}")
+
+    df_unc = pd.DataFrame({"path": uncovered})
+    if len(df_unc) > 0:
+        df_unc["group"] = df_unc["path"].apply(group_key)
+        print("\n-- Uncovered groups (count) --")
+        print(df_unc.groupby("group").size().sort_values(ascending=False))
+        print("\n-- Uncovered examples (top 50) --")
+        print(df_unc["path"].head(50).to_list())
+
+    conn.close()
+
+def watch_schema(schema_path, interval=1.0):
+    last_db_mtime = 0
+    last_schema_mtime = 0
+    first = True
+    while True:
+        if not os.path.exists(db_path):
+            print(f"db not found: {db_path}")
+            time.sleep(interval)
+            continue
+        db_mtime = os.path.getmtime(db_path)
+        schema_mtime = os.path.getmtime(schema_path) if os.path.exists(schema_path) else 0
+        if first or db_mtime > last_db_mtime or schema_mtime > last_schema_mtime:
+            conn = sqlite3.connect(db_path)
+            refresh_schema_sql(conn, schema_path)
+            print(f"\n[{time.strftime('%Y-%m-%d %H:%M:%S')}] schema.sql refreshed")
+            print_schema(conn)
+            conn.close()
+            last_db_mtime = db_mtime
+            last_schema_mtime = os.path.getmtime(schema_path) if os.path.exists(schema_path) else 0
+            first = False
+        time.sleep(interval)
+
+if __name__ == "__main__":
+    args = [a.lower() for a in sys.argv[1:]]
+    if "dump_uncovered" in args or "uncovered" in args:
+        dump_uncovered('database/original_json_schema/uncovered_features.csv')
+    elif "watch_schema" in args or "watch" in args:
+        try:
+            watch_schema('database/L2/schema.sql')
+        except KeyboardInterrupt:
+            pass
+    elif "schema" in args or "refresh_schema" in args:
+        if not os.path.exists(db_path):
+            print(f"db not found: {db_path}")
+        else:
+            conn = sqlite3.connect(db_path)
+            if "refresh_schema" in args:
+                refresh_schema_sql(conn, 'database/L2/schema.sql')
+                print("schema.sql refreshed")
+            print_schema(conn)
+            conn.close()
+    else:
+        verify()
--- a/ETL/verify/verify_L3.py
+++ b/ETL/verify/verify_L3.py
@@ -0,0 +1,29 @@
+
+import sqlite3
+import pandas as pd
+
+# SQLite database opened by verify(). The path is relative, so it is
+# resolved against the current working directory of the process.
+L3_DB_PATH = 'database/L3/L3_Features.sqlite'
+
+def verify():
+    conn = sqlite3.connect(L3_DB_PATH)
+    
+    # 1. Row count
+    cursor = conn.cursor()
+    cursor.execute("SELECT COUNT(*) FROM dm_player_features")
+    count = cursor.fetchone()[0]
+    print(f"Total Players in L3: {count}")
+    
+    # 2. Sample Data
+    df = pd.read_sql_query("SELECT * FROM dm_player_features LIMIT 5", conn)
+    print("\nSample Data (First 5 rows):")
+    print(df[['steam_id_64', 'total_matches', 'basic_avg_rating', 'sta_last_30_rating', 'bat_kd_diff_high_elo', 'hps_clutch_win_rate_1v1']].to_string())
+    
+    # 3. Stats Summary
+    print("\nStats Summary:")
+    full_df = pd.read_sql_query("SELECT basic_avg_rating, sta_last_30_rating, bat_win_rate_vs_all FROM dm_player_features", conn)
+    print(full_df.describe())
+    
+    conn.close()
+
+if __name__ == "__main__":
+    verify()
--- a/ETL/verify/verify_deep.py
+++ b/ETL/verify/verify_deep.py
@@ -0,0 +1,82 @@
+import sqlite3
+import pandas as pd
+import numpy as np
+import sys
+
+# Configure pandas display options so that printed frames are never
+# truncated: show every row and column, allow very wide output and full
+# cell contents, and format floats with two decimal places.
+pd.set_option('display.max_rows', None)
+pd.set_option('display.max_columns', None)
+pd.set_option('display.width', 2000)
+pd.set_option('display.float_format', '{:.2f}'.format)
+pd.set_option('display.max_colwidth', None)
+
+# SQLite database inspected by check_all_tables(). The path is relative, so
+# it is resolved against the current working directory of the process.
+db_path = 'database/L2/L2_Main.sqlite'
+
+def check_all_tables():
+    conn = sqlite3.connect(db_path)
+    
+    # 获取所有表名
+    tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'", conn)['name'].tolist()
+    
+    for table in tables:
+        print(f"\n{'='*20} Table: {table} {'='*20}")
+        
+        # 获取表的所有列
+        cols_info = pd.read_sql(f"PRAGMA table_info({table})", conn)
+        cols = cols_info['name'].tolist()
+        
+        # 读取全表数据
+        df = pd.read_sql(f"SELECT * FROM {table}", conn)
+        total = len(df)
+        
+        if total == 0:
+            print(f"Table is empty (0 rows)")
+            continue
+            
+        print(f"Total Rows: {total}")
+        print("-" * 60)
+        
+        stats = []
+        for col in cols:
+            # 1. Null Check
+            nulls = df[col].isnull().sum()
+            
+            # 2. Zero Check (仅对数值型或可转换为数值的列)
+            zeros = 0
+            try:
+                # 尝试转为数值，无法转换的变为NaN
+                numeric_series = pd.to_numeric(df[col], errors='coerce')
+                # 统计0值 (排除原本就是NaN的)
+                zeros = (numeric_series == 0).sum()
+            except:
+                zeros = 0
+                
+            # 3. Unique Count (基数)
+            unique_count = df[col].nunique()
+            
+            # 4. Example Value (取第一个非空值)
+            example = df[col].dropna().iloc[0] if df[col].count() > 0 else 'ALL NULL'
+            
+            stats.append({
+                'Field': col,
+                'Nulls': nulls,
+                'Null%': (nulls/total)*100,
+                'Zeros': zeros,
+                'Zero%': (zeros/total)*100,
+                'Unique': unique_count,
+                'Example': str(example)[:50] # 截断过长示例
+            })
+            
+        # 输出完整统计表
+        df_stats = pd.DataFrame(stats)
+        # 按 Zero% 降序排列，但保证 Null% 高的也显眼，这里默认不排序直接按字段序，或者按关注度排序
+        # 用户要求全面探查，按字段原序输出可能更符合直觉，或者按Zero%排序
+        # 这里为了排查问题，按 Zero% 降序输出
+        df_stats = df_stats.sort_values('Zero%', ascending=False)
+        print(df_stats.to_string(index=False))
+        print("\n")
+        
+    conn.close()
+
+if __name__ == "__main__":
+    check_all_tables()