0.4.4 : L2 ver3 fixed several empty stats in L2.

2026-01-24 02:32:31 +08:00
parent 7ab9df2acf
commit 01451c0b4b
4 changed files with 103 additions and 66 deletions
--- a/ETL/verify/verify_deep.py
+++ b/ETL/verify/verify_deep.py
@@ -1,81 +1,82 @@
 import sqlite3
 import pandas as pd
 import numpy as np
+import sys

+# Configure pandas display options so that no rows or columns are elided when printing
+pd.set_option('display.max_rows', None)
 pd.set_option('display.max_columns', None)
-pd.set_option('display.width', 1000)
+pd.set_option('display.width', 2000)
 pd.set_option('display.float_format', '{:.2f}'.format)
+pd.set_option('display.max_colwidth', None)

 db_path = 'database/L2/L2_Main.sqlite'

-def check_nulls_zeros():
+def check_all_tables():  # Print per-column null/zero/unique/example stats for every table in db_path
    conn = sqlite3.connect(db_path)
    
-    print("=== 1. Fact Match Players: 关键字段零值/空值检查 ===")
-    df_players = pd.read_sql("""
-        SELECT 
-            kills, deaths, assists, adr, rating, rating2, 
-            kast, awp_kills, flash_duration, jump_count, 
-            elo_change
-        FROM fact_match_players
-    """, conn)
-    
-    stats = []
-    for col in df_players.columns:
-        total = len(df_players)
-        nulls = df_players[col].isnull().sum()
-        zeros = (df_players[col] == 0).sum()
-        stats.append({
-            'Field': col,
-            'Total': total,
-            'Nulls': nulls,
-            'Null%': (nulls/total)*100,
-            'Zeros': zeros,
-            'Zero%': (zeros/total)*100
-        })
-    print(pd.DataFrame(stats))
-
-    print("\n=== 2. Fact Round Events (Kills): 击杀完整性检查 ===")
-    # 只检查 event_type = 'kill' 的记录
-    df_kills = pd.read_sql("""
-        SELECT 
-            attacker_steam_id, victim_steam_id, 
-            event_time, weapon, 
-            attacker_pos_x, score_change_attacker
-        FROM fact_round_events 
-        WHERE event_type = 'kill'
-    """, conn)
-    
-    total_kills = len(df_kills)
-    missing_attacker = df_kills['attacker_steam_id'].isnull().sum() + (df_kills['attacker_steam_id'] == '').sum()
-    missing_victim = df_kills['victim_steam_id'].isnull().sum() + (df_kills['victim_steam_id'] == '').sum()
-    
-    # 检查 attacker 和 victim 是否相同（自杀）
-    self_kills = (df_kills['attacker_steam_id'] == df_kills['victim_steam_id']).sum()
-    
-    print(f"Total Kill Events: {total_kills}")
-    print(f"Missing Attacker: {missing_attacker} ({missing_attacker/total_kills*100:.2f}%)")
-    print(f"Missing Victim: {missing_victim} ({missing_victim/total_kills*100:.2f}%)")
-    print(f"Self Kills (Suicide?): {self_kills}")
-
-    print("\n=== 3. Fact Round Events: 坐标与评分覆盖率 ===")
-    # 坐标应该在 classic 比赛中有值，leetify 比赛中可能为空
-    # 评分应该在 leetify 比赛中有值
-    
-    df_events = pd.read_sql("""
-        SELECT 
-            m.data_source_type,
-            COUNT(*) as total_events,
-            SUM(CASE WHEN e.attacker_pos_x IS NOT NULL AND e.attacker_pos_x != 0 THEN 1 ELSE 0 END) as has_pos,
-            SUM(CASE WHEN e.score_change_attacker IS NOT NULL AND e.score_change_attacker != 0 THEN 1 ELSE 0 END) as has_score
-        FROM fact_round_events e
-        JOIN fact_matches m ON e.match_id = m.match_id
-        WHERE e.event_type = 'kill'
-        GROUP BY m.data_source_type
-    """, conn)
-    print(df_events)
+    # List every table name, skipping SQLite's internal sqlite_* tables
+    tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'", conn)['name'].tolist()
    
+    for table in tables:
+        print(f"\n{'='*20} Table: {table} {'='*20}")
+        
+        # Column names from PRAGMA table_info (NOTE(review): table name is interpolated unquoted)
+        cols_info = pd.read_sql(f"PRAGMA table_info({table})", conn)
+        cols = cols_info['name'].tolist()
+        
+        # Load the whole table into memory (NOTE(review): same unquoted interpolation as above)
+        df = pd.read_sql(f"SELECT * FROM {table}", conn)
+        total = len(df)
+        
+        if total == 0:
+            print(f"Table is empty (0 rows)")
+            continue
+            
+        print(f"Total Rows: {total}")
+        print("-" * 60)
+        
+        stats = []
+        for col in cols:
+            # 1. Null Check
+            nulls = df[col].isnull().sum()
+            
+            # 2. Zero Check (only values that are numeric or can be coerced to numeric)
+            zeros = 0
+            try:
+                # Coerce to numeric; values that cannot be converted become NaN
+                numeric_series = pd.to_numeric(df[col], errors='coerce')
+                # Count zeros (NaN never compares equal to 0, so nulls are not counted)
+                zeros = (numeric_series == 0).sum()
+            except:  # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit
+                zeros = 0
+                
+            # 3. Unique Count (cardinality)
+            unique_count = df[col].nunique()
+            
+            # 4. Example Value (first non-null value, or 'ALL NULL' when there is none)
+            example = df[col].dropna().iloc[0] if df[col].count() > 0 else 'ALL NULL'
+            
+            stats.append({
+                'Field': col,
+                'Nulls': nulls,
+                'Null%': (nulls/total)*100,
+                'Zeros': zeros,
+                'Zero%': (zeros/total)*100,
+                'Unique': unique_count,
+                'Example': str(example)[:50] # truncate long examples to 50 characters
+            })
+            
+        # Print the full statistics table
+        df_stats = pd.DataFrame(stats)
+        # Design note: the original field order may be more intuitive for a full survey,
+        # and a Zero% sort does not by itself make high-Null% columns stand out.
+        # For troubleshooting purposes, the rows are sorted by Zero% in descending order.
+        df_stats = df_stats.sort_values('Zero%', ascending=False)
+        print(df_stats.to_string(index=False))
+        print("\n")
+        
    conn.close()

 if __name__ == "__main__":
-    check_nulls_zeros()
+    check_all_tables()  # script entry point: run the all-tables check
--- a/ETL/verify/verify_report.txt
+++ b/ETL/verify/verify_report.txt