0.4 : L2 ver1 finished
This commit is contained in:
245
ETL/verify_L2.py
Normal file
245
ETL/verify_L2.py
Normal file
@@ -0,0 +1,245 @@
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
import csv
|
||||
|
||||
# Display settings for the DataFrames this script prints: no cap on the number
# of columns shown, and a 1000-character display width.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Default location of the L2 SQLite database (relative path).
db_path = 'database/L2/L2_Main.sqlite'
|
||||
|
||||
# Tables whose row counts are printed in the "Counts" section.
_COUNT_TABLES = (
    'dim_players',
    'dim_maps',
    'fact_matches',
    'fact_match_players',
    'fact_match_players_t',
    'fact_match_players_ct',
    'fact_rounds',
    'fact_round_events',
    'fact_round_player_economy',
)

# Columns shown for the sampled kill event of each data source.
_EVENT_SAMPLE_COLUMNS = [
    'event_type', 'attacker_steam_id', 'trade_killer_steam_id',
    'attacker_pos_x', 'score_change_attacker',
]

# Columns compared between fact_match_players and its _t / _ct side tables.
_COMPARE_COLUMNS = [
    'kills', 'deaths', 'assists', 'headshot_count', 'adr', 'rating', 'rating2',
    'rating3', 'rws', 'mvp_count', 'flash_duration', 'jump_count', 'is_win',
]
# Columns where the full-match value is checked against T + CT.
_ADDITIVE_COLUMNS = [
    'kills', 'deaths', 'assists', 'headshot_count', 'mvp_count',
    'flash_duration', 'jump_count',
]
# Columns where only presence (empty vs. non-empty) is compared.
_NON_ADDITIVE_COLUMNS = ['adr', 'rating', 'rating2', 'rating3', 'rws', 'is_win']
_RATING_COLUMNS = ['rating', 'rating2', 'rating3']

# L2 column name -> fight_any key it is treated as covering.
_FIGHT_ALIAS = {
    'kills': 'kill',
    'deaths': 'death',
    'assists': 'assist',
    'headshot_count': 'headshot',
    'mvp_count': 'is_mvp',
    'flash_duration': 'flash_enemy_time',
    'jump_count': 'jump_total',
    'awp_kills': 'awp_kill',
}

# Schema path fragments counted in the "leetify economy" section.
_ECON_PREFIXES = (
    'data.leetify_data.round_stat[].bron_equipment.',
    'data.leetify_data.round_stat[].player_t_score.',
    'data.leetify_data.round_stat[].player_ct_score.',
    'data.leetify_data.round_stat[].player_bron_crash.',
)

_DEFAULT_SCHEMA_PATH = 'database/original_json_schema/schema_flat.csv'


def _scalar(conn, sql, params=()):
    """Execute *sql* and return the first column of the first result row."""
    return conn.execute(sql, params).fetchone()[0]


def _is_empty(series):
    """Return a boolean mask that is True where *series* is null or equal to 0."""
    return series.isna() | (series == 0)


def _is_nonzero(series):
    """Return a boolean mask that is True where *series* is non-null and not 0."""
    return series.notna() & (series != 0)


def _print_counts(conn):
    """Print the row count of every table in ``_COUNT_TABLES``."""
    print("--- Counts ---")
    for table in _COUNT_TABLES:
        # Table names come from the fixed tuple above, never from user input.
        count = _scalar(conn, f"SELECT COUNT(*) FROM {table}")
        print(f"{table}: {count}")


def _print_sample_event(conn, source_type, label):
    """Print one kill event from the first match whose data_source_type matches."""
    match_row = conn.execute(
        "SELECT match_id FROM fact_matches WHERE data_source_type=? LIMIT 1",
        (source_type,),
    ).fetchone()
    if not match_row:
        return
    mid = match_row[0]
    print(f"{label} Match: {mid}")
    # Bind the id as a parameter instead of formatting it into the SQL text.
    events = pd.read_sql(
        "SELECT * FROM fact_round_events WHERE match_id=? AND event_type='kill' LIMIT 1",
        conn,
        params=(mid,),
    )
    print(events[_EVENT_SAMPLE_COLUMNS])


def _print_integrity_checks(conn):
    """Print orphan-row counts and the number of all-zero K/D/A side rows."""
    print("\n--- Integrity Checks ---")
    missing_players = _scalar(conn, """
        SELECT COUNT(*) FROM fact_match_players f
        LEFT JOIN dim_players d ON f.steam_id_64 = d.steam_id_64
        WHERE d.steam_id_64 IS NULL
    """)
    print(f"fact_match_players missing dim_players: {missing_players}")

    missing_round_matches = _scalar(conn, """
        SELECT COUNT(*) FROM fact_rounds r
        LEFT JOIN fact_matches m ON r.match_id = m.match_id
        WHERE m.match_id IS NULL
    """)
    print(f"fact_rounds missing fact_matches: {missing_round_matches}")

    missing_event_rounds = _scalar(conn, """
        SELECT COUNT(*) FROM fact_round_events e
        LEFT JOIN fact_rounds r ON e.match_id = r.match_id AND e.round_num = r.round_num
        WHERE r.match_id IS NULL
    """)
    print(f"fact_round_events missing fact_rounds: {missing_event_rounds}")

    zero_kda = "WHERE COALESCE(kills,0)=0 AND COALESCE(deaths,0)=0 AND COALESCE(assists,0)=0"
    side_zero_t = _scalar(conn, f"SELECT COUNT(*) FROM fact_match_players_t {zero_kda}")
    side_zero_ct = _scalar(conn, f"SELECT COUNT(*) FROM fact_match_players_ct {zero_kda}")
    print(f"fact_match_players_t zero K/D/A: {side_zero_t}")
    print(f"fact_match_players_ct zero K/D/A: {side_zero_ct}")


def _load_side_comparison(conn):
    """Return fact_match_players left-joined with its _t and _ct rows.

    Side-table columns are suffixed ``_t`` / ``_ct``; the join key is
    (match_id, steam_id_64).
    """
    column_sql = ",".join(_COMPARE_COLUMNS)
    keys = ['match_id', 'steam_id_64']

    def load(table, suffix):
        frame = pd.read_sql(f"SELECT match_id, steam_id_64, {column_sql} FROM {table}", conn)
        if suffix:
            frame = frame.rename(columns={c: f"{c}{suffix}" for c in _COMPARE_COLUMNS})
        return frame

    merged = load('fact_match_players', '')
    merged = merged.merge(load('fact_match_players_t', '_t'), on=keys, how='left')
    return merged.merge(load('fact_match_players_ct', '_ct'), on=keys, how='left')


def _print_side_comparison(df):
    """Print empty counts, additive mismatches and presence mismatches for *df*."""
    print("\n--- Full vs T/CT Comparison ---")
    for c in _COMPARE_COLUMNS:
        print(f"{c} empty: {_is_empty(df[c]).sum()}")

    for c in _ADDITIVE_COLUMNS:
        side_sum = df[f"{c}_t"].fillna(0) + df[f"{c}_ct"].fillna(0)
        # flash_duration is fractional, so allow a small rounding tolerance.
        tol = 0.01 if c == 'flash_duration' else 0
        mismatch = (df[c].fillna(0) - side_sum).abs() > tol
        print(f"{c} full != t+ct: {mismatch.sum()}")

    for c in _NON_ADDITIVE_COLUMNS:
        full_empty = _is_empty(df[c])
        side_nonempty = (~_is_empty(df[f"{c}_t"])) | (~_is_empty(df[f"{c}_ct"]))
        print(f"{c} full empty but side has: {(full_empty & side_nonempty).sum()}")
        print(f"{c} full has but side empty: {((~full_empty) & (~side_nonempty)).sum()}")


def _print_rating_detail(conn, df):
    """Print null/zero/nonzero breakdowns for each rating column and per source."""
    print("\n--- Rating Detail ---")
    for c in _RATING_COLUMNS:
        full_nonzero = _is_nonzero(df[c])
        t_nonzero = _is_nonzero(df[f"{c}_t"])
        ct_nonzero = _is_nonzero(df[f"{c}_ct"])
        side_any_nonzero = t_nonzero | ct_nonzero
        print(
            f"{c} full null: {df[c].isna().sum()} full zero: {(df[c] == 0).sum()} "
            f"full nonzero: {full_nonzero.sum()}"
        )
        print(f"{c} side t nonzero: {t_nonzero.sum()} side ct nonzero: {ct_nonzero.sum()}")
        print(f"{c} full nonzero but side all zero: {(full_nonzero & ~side_any_nonzero).sum()}")
        print(f"{c} full zero but side has: {(_is_empty(df[c]) & side_any_nonzero).sum()}")

    df_rating_src = pd.read_sql(
        "SELECT f.rating, f.rating2, f.rating3, m.data_source_type FROM fact_match_players f JOIN fact_matches m ON f.match_id = m.match_id",
        conn,
    )
    for c in _RATING_COLUMNS:
        # NULL compares as "!= 0", so require non-null explicitly; otherwise
        # missing ratings would be reported as nonzero.
        flags = _is_nonzero(df_rating_src[c])
        grp = flags.groupby(df_rating_src['data_source_type']).sum().reset_index(name='nonzero')
        print(f"{c} nonzero by source")
        print(grp)


def _read_schema_paths(schema_path):
    """Return the second column of every data row in the flat schema CSV."""
    with open(schema_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        return [row[1] for row in reader if len(row) >= 2]


def _print_fight_coverage(conn, paths):
    """Print which fight_any schema keys map to fact_match_players columns.

    Also prints, for every mapped key, how many rows hold zero / null /
    nonzero values in the corresponding column.
    """
    print("\n--- Schema Coverage (fight_any) ---")
    fight_keys = set()
    for p in paths:
        if 'data.group_N[].fight_any.' in p:
            fight_keys.add(p.split('fight_any.')[1].split('.')[0])

    l2_cols = {row[1] for row in conn.execute("PRAGMA table_info(fact_match_players)")}

    covered = set()
    for c in l2_cols:
        if c in fight_keys:
            covered.add(c)
        elif c in _FIGHT_ALIAS and _FIGHT_ALIAS[c] in fight_keys:
            covered.add(_FIGHT_ALIAS[c])
    missing_keys = sorted(fight_keys - covered)
    print(f"fight_any keys: {len(fight_keys)}")
    print(f"covered by L2 columns: {len(covered)}")
    print(f"uncovered fight_any keys: {len(missing_keys)}")
    if missing_keys:
        print(missing_keys)

    print("\n--- Coverage Zero Rate (fight_any -> fact_match_players) ---")
    # Only map to alias columns that really exist in the table; selecting a
    # non-existent column would make the query below fail.
    key_to_l2 = {src: l2 for l2, src in _FIGHT_ALIAS.items() if l2 in l2_cols}
    col_map = {}
    for k in fight_keys:
        if k in l2_cols:
            col_map[k] = k
        elif k in key_to_l2:
            col_map[k] = key_to_l2[k]

    select_cols = ["steam_id_64"] + sorted(set(col_map.values()) - {"steam_id_64"})
    df_fight = pd.read_sql(
        "SELECT " + ",".join(select_cols) + " FROM fact_match_players",
        conn,
    )
    total_rows = len(df_fight)
    stats = []
    for fight_key, col in sorted(col_map.items()):
        s = df_fight[col]
        zeros = (s == 0).sum()
        nulls = s.isna().sum()
        stats.append({
            "fight_key": fight_key,
            "column": col,
            "nonzero": total_rows - zeros - nulls,
            "zero": zeros,
            "null": nulls,
            "zero_rate": 0 if total_rows == 0 else round(zeros / total_rows, 4),
        })
    # Explicit columns keep sort_values working when no key is covered.
    df_stats = pd.DataFrame(
        stats, columns=["fight_key", "column", "nonzero", "zero", "null", "zero_rate"]
    ).sort_values(["zero_rate", "nonzero"], ascending=[False, True])
    print(df_stats.head(30))
    print("\n-- zero_rate top (most zeros) --")
    print(df_stats.head(10))
    print("\n-- zero_rate bottom (most nonzero) --")
    print(df_stats.tail(10))


def _print_econ_coverage(paths):
    """Print how many schema paths contain each leetify economy fragment."""
    print("\n--- Schema Coverage (leetify economy) ---")
    for k in _ECON_PREFIXES:
        count = sum(1 for p in paths if k in p)
        print(f"{k} paths: {count}")


def verify(database=None, schema_path=None):
    """Print a verification report for the L2 SQLite database.

    The report covers table row counts, the data-source distribution, one
    sample kill event per source type, sample player stats, referential
    integrity counts, a comparison of full-match stats against the T/CT side
    tables, rating breakdowns, and coverage of the flat JSON schema.

    Args:
        database: Path of the SQLite file to inspect. Defaults to the
            module-level ``db_path``.
        schema_path: Path of the flat schema CSV. Defaults to
            ``database/original_json_schema/schema_flat.csv``.

    Raises:
        sqlite3.Error: If an expected table or column is missing.
        OSError: If the schema CSV cannot be opened.
    """
    if database is None:
        database = db_path
    if schema_path is None:
        schema_path = _DEFAULT_SCHEMA_PATH

    conn = sqlite3.connect(database)
    try:
        _print_counts(conn)

        print("\n--- Data Source Distribution ---")
        dist = pd.read_sql(
            "SELECT data_source_type, COUNT(*) as cnt FROM fact_matches GROUP BY data_source_type",
            conn,
        )
        print(dist)

        print("\n--- Sample Round Events (Leetify vs Classic) ---")
        _print_sample_event(conn, 'leetify', 'Leetify')
        _print_sample_event(conn, 'classic', 'Classic')

        print("\n--- Sample Player Stats (New Fields) ---")
        df_players = pd.read_sql(
            "SELECT steam_id_64, rating, rating3, elo_change, rank_score, flash_duration, jump_count FROM fact_match_players LIMIT 5",
            conn,
        )
        print(df_players)

        _print_integrity_checks(conn)

        df = _load_side_comparison(conn)
        _print_side_comparison(df)
        _print_rating_detail(conn, df)

        paths = _read_schema_paths(schema_path)
        _print_fight_coverage(conn, paths)
        _print_econ_coverage(paths)
    finally:
        # Release the database handle even when a query or the CSV read fails.
        conn.close()
|
||||
|
||||
# Run the verification report only when this file is executed as a script.
if __name__ == "__main__":
    verify()
|
||||
Reference in New Issue
Block a user