0.4.4 : L2 ver3 fixed several empty stats in L2.
This commit is contained in:
@@ -1,81 +1,82 @@
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import sys
|
||||
|
||||
# Configure pandas display options so that no rows or columns are elided
# when the reports are printed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Only one width setting is kept: a width of 1000 set immediately before
# this one would be overridden and have no effect.
pd.set_option('display.width', 2000)
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_colwidth', None)

# Path of the SQLite database inspected by this script.
db_path = 'database/L2/L2_Main.sqlite'
|
||||
|
||||
def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    if not whole:
        return 0.0
    return (part / whole) * 100


def _quote_identifier(name):
    """Return ``name`` as a double-quoted SQLite identifier."""
    return '"' + str(name).replace('"', '""') + '"'


def _profile_column(series, total):
    """Build the null/zero/cardinality summary row for one column.

    Args:
        series: The column values as a pandas Series.
        total: Number of rows in the table (non-zero).

    Returns:
        A dict with the keys printed in the per-table report.
    """
    nulls = series.isnull().sum()

    # Zero check: values that cannot be converted to a number become NaN
    # and are therefore never counted as zero.
    try:
        zeros = (pd.to_numeric(series, errors='coerce') == 0).sum()
    except (TypeError, ValueError):
        zeros = 0

    # Example value: the first non-null entry, if there is one.
    non_null = series.dropna()
    example = non_null.iloc[0] if len(non_null) > 0 else 'ALL NULL'

    return {
        'Field': series.name,
        'Nulls': nulls,
        'Null%': _percent(nulls, total),
        'Zeros': zeros,
        'Zero%': _percent(zeros, total),
        'Unique': series.nunique(),
        'Example': str(example)[:50],  # truncate overly long examples
    }


def check_nulls_zeros(database=None):
    """Print null/zero coverage for key player stats and kill events.

    Three reports are printed: per-field null and zero counts for
    ``fact_match_players``, completeness of kill events in
    ``fact_round_events``, and position/score coverage of kill events
    grouped by ``fact_matches.data_source_type``.

    Args:
        database: Path of the SQLite file to inspect. When ``None`` the
            module-level ``db_path`` is used.
    """
    conn = sqlite3.connect(db_path if database is None else database)
    try:
        print("=== 1. Fact Match Players: 关键字段零值/空值检查 ===")
        df_players = pd.read_sql("""
            SELECT
                kills, deaths, assists, adr, rating, rating2,
                kast, awp_kills, flash_duration, jump_count,
                elo_change
            FROM fact_match_players
        """, conn)

        total = len(df_players)
        stats = []
        for col in df_players.columns:
            nulls = df_players[col].isnull().sum()
            zeros = (df_players[col] == 0).sum()
            stats.append({
                'Field': col,
                'Total': total,
                'Nulls': nulls,
                'Null%': _percent(nulls, total),
                'Zeros': zeros,
                'Zero%': _percent(zeros, total),
            })
        print(pd.DataFrame(stats))

        print("\n=== 2. Fact Round Events (Kills): 击杀完整性检查 ===")
        # Only rows with event_type = 'kill' are inspected.
        df_kills = pd.read_sql("""
            SELECT
                attacker_steam_id, victim_steam_id,
                event_time, weapon,
                attacker_pos_x, score_change_attacker
            FROM fact_round_events
            WHERE event_type = 'kill'
        """, conn)

        total_kills = len(df_kills)
        attacker = df_kills['attacker_steam_id']
        victim = df_kills['victim_steam_id']
        attacker_missing = attacker.isnull() | (attacker == '')
        victim_missing = victim.isnull() | (victim == '')
        missing_attacker = attacker_missing.sum()
        missing_victim = victim_missing.sum()

        # Attacker equal to victim means a self-kill. Rows whose attacker id
        # is blank are excluded so that two empty ids do not count as one.
        self_kills = ((attacker == victim) & ~attacker_missing).sum()

        print(f"Total Kill Events: {total_kills}")
        print(f"Missing Attacker: {missing_attacker} ({_percent(missing_attacker, total_kills):.2f}%)")
        print(f"Missing Victim: {missing_victim} ({_percent(missing_victim, total_kills):.2f}%)")
        print(f"Self Kills (Suicide?): {self_kills}")

        print("\n=== 3. Fact Round Events: 坐标与评分覆盖率 ===")
        # Per the original notes: positions are expected for "classic"
        # matches and may be empty for "leetify" matches, while score
        # changes are expected for "leetify" matches.
        df_events = pd.read_sql("""
            SELECT
                m.data_source_type,
                COUNT(*) as total_events,
                SUM(CASE WHEN e.attacker_pos_x IS NOT NULL AND e.attacker_pos_x != 0 THEN 1 ELSE 0 END) as has_pos,
                SUM(CASE WHEN e.score_change_attacker IS NOT NULL AND e.score_change_attacker != 0 THEN 1 ELSE 0 END) as has_score
            FROM fact_round_events e
            JOIN fact_matches m ON e.match_id = m.match_id
            WHERE e.event_type = 'kill'
            GROUP BY m.data_source_type
        """, conn)
        print(df_events)
    finally:
        # Close the connection even when a query fails.
        conn.close()


def check_all_tables(database=None):
    """Print a per-column null/zero/cardinality profile of every user table.

    Every table listed in ``sqlite_master`` (excluding ``sqlite_*`` internal
    tables) is loaded in full and summarised; empty tables are reported and
    skipped.

    Args:
        database: Path of the SQLite file to inspect. When ``None`` the
            module-level ``db_path`` is used.
    """
    conn = sqlite3.connect(db_path if database is None else database)
    try:
        # Collect the names of all user tables.
        tables = pd.read_sql(
            "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'",
            conn,
        )['name'].tolist()

        for table in tables:
            print(f"\n{'='*20} Table: {table} {'='*20}")

            # Quote the name so tables containing spaces, quotes or
            # reserved words can still be queried.
            quoted = _quote_identifier(table)

            # Column names of the table.
            cols = pd.read_sql(f"PRAGMA table_info({quoted})", conn)['name'].tolist()

            # Load the whole table.
            df = pd.read_sql(f"SELECT * FROM {quoted}", conn)
            total = len(df)

            if total == 0:
                print("Table is empty (0 rows)")
                continue

            print(f"Total Rows: {total}")
            print("-" * 60)

            df_stats = pd.DataFrame([_profile_column(df[col], total) for col in cols])
            # Highest Zero% first; a stable sort keeps the original field
            # order among ties.
            df_stats = df_stats.sort_values('Zero%', ascending=False, kind='stable')
            print(df_stats.to_string(index=False))
            print("\n")
    finally:
        # Close the connection even when a query fails.
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
    # Run the targeted null/zero report first, then the full table profile.
    for report in (check_nulls_zeros, check_all_tables):
        report()
|
||||
|
||||
BIN
ETL/verify/verify_report.txt
Normal file
BIN
ETL/verify/verify_report.txt
Normal file
Binary file not shown.
Reference in New Issue
Block a user