"""Print a sanity-check report for the L2 SQLite warehouse.

The report covers row counts, data-source distribution, sample rows,
referential-integrity counts, a comparison of full-match player stats against
their T-side/CT-side splits, and how much of the raw ``fight_any`` JSON schema
is represented by columns of ``fact_match_players``.
"""

import csv
import sqlite3

import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

db_path = 'database/L2/L2_Main.sqlite'
schema_path = 'database/original_json_schema/schema_flat.csv'

# Tables whose row counts are reported first.
TABLES = [
    'dim_players',
    'dim_maps',
    'fact_matches',
    'fact_match_players',
    'fact_match_players_t',
    'fact_match_players_ct',
    'fact_rounds',
    'fact_round_events',
    'fact_round_player_economy',
]

# Columns shown for the sample kill event of each data source.
EVENT_COLS = [
    'event_type',
    'attacker_steam_id',
    'trade_killer_steam_id',
    'attacker_pos_x',
    'score_change_attacker',
]

# Stat columns read from fact_match_players and from its _t / _ct variants.
STAT_COLS = [
    'kills', 'deaths', 'assists', 'headshot_count', 'adr', 'rating',
    'rating2', 'rating3', 'rws', 'mvp_count', 'flash_duration', 'jump_count',
    'is_win',
]
# Columns for which the report checks full == t + ct.
ADDITIVE_COLS = [
    'kills', 'deaths', 'assists', 'headshot_count', 'mvp_count',
    'flash_duration', 'jump_count',
]
# Columns for which only "present on one side but not the other" is checked.
NON_ADDITIVE_COLS = ['adr', 'rating', 'rating2', 'rating3', 'rws', 'is_win']
RATING_COLS = ['rating', 'rating2', 'rating3']

# L2 column name -> fight_any key, for columns that were renamed on load.
FIGHT_ANY_ALIAS = {
    'kills': 'kill',
    'deaths': 'death',
    'assists': 'assist',
    'headshot_count': 'headshot',
    'mvp_count': 'is_mvp',
    'flash_duration': 'flash_enemy_time',
    'jump_count': 'jump_total',
    'awp_kills': 'awp_kill',
}

# Schema path fragments counted in the economy section.
ECON_PREFIXES = [
    'data.leetify_data.round_stat[].bron_equipment.',
    'data.leetify_data.round_stat[].player_t_score.',
    'data.leetify_data.round_stat[].player_ct_score.',
    'data.leetify_data.round_stat[].player_bron_crash.',
]


def _scalar(conn, sql, params=()):
    """Run *sql* and return the first column of its first row."""
    return conn.execute(sql, params).fetchone()[0]


def _is_empty(s):
    """Return a boolean mask of the values in *s* that are NULL or zero."""
    return s.isna() | (s == 0)


def _is_nonzero(s):
    """Return a boolean mask of the values in *s* that are non-NULL and non-zero."""
    # NaN != 0 evaluates to True, so the notna() guard is required.
    return s.notna() & (s != 0)


def _print_counts(conn):
    """Print the row count of every table in TABLES."""
    print("--- Counts ---")
    for table in TABLES:
        # Table names come from the constant list above, never from input.
        count = _scalar(conn, f"SELECT COUNT(*) FROM {table}")
        print(f"{table}: {count}")


def _print_source_distribution(conn):
    """Print the number of matches per data_source_type."""
    print("\n--- Data Source Distribution ---")
    dist = pd.read_sql(
        "SELECT data_source_type, COUNT(*) as cnt FROM fact_matches "
        "GROUP BY data_source_type",
        conn,
    )
    print(dist)


def _print_sample_events(conn):
    """Print one kill event from one leetify match and one classic match."""
    print("\n--- Sample Round Events (Leetify vs Classic) ---")
    for source, label in (('leetify', 'Leetify'), ('classic', 'Classic')):
        match = conn.execute(
            "SELECT match_id FROM fact_matches WHERE data_source_type = ? LIMIT 1",
            (source,),
        ).fetchone()
        if match is None:
            continue
        mid = match[0]
        print(f"{label} Match: {mid}")
        # Bind the id instead of formatting it into the SQL text so that ids
        # containing quotes, or stored as integers, are compared correctly.
        df = pd.read_sql(
            "SELECT * FROM fact_round_events "
            "WHERE match_id = ? AND event_type = 'kill' LIMIT 1",
            conn,
            params=(mid,),
        )
        print(df[EVENT_COLS])


def _print_sample_player_stats(conn):
    """Print the first five rows of selected fact_match_players columns."""
    print("\n--- Sample Player Stats (New Fields) ---")
    df_players = pd.read_sql(
        "SELECT steam_id_64, rating, rating3, elo_change, rank_score, "
        "flash_duration, jump_count FROM fact_match_players LIMIT 5",
        conn,
    )
    print(df_players)


def _print_integrity_checks(conn):
    """Print orphan-row counts and the number of all-zero K/D/A side rows."""
    print("\n--- Integrity Checks ---")
    missing_players = _scalar(conn, """
        SELECT COUNT(*)
        FROM fact_match_players f
        LEFT JOIN dim_players d ON f.steam_id_64 = d.steam_id_64
        WHERE d.steam_id_64 IS NULL
    """)
    print(f"fact_match_players missing dim_players: {missing_players}")

    missing_round_matches = _scalar(conn, """
        SELECT COUNT(*)
        FROM fact_rounds r
        LEFT JOIN fact_matches m ON r.match_id = m.match_id
        WHERE m.match_id IS NULL
    """)
    print(f"fact_rounds missing fact_matches: {missing_round_matches}")

    missing_event_rounds = _scalar(conn, """
        SELECT COUNT(*)
        FROM fact_round_events e
        LEFT JOIN fact_rounds r
            ON e.match_id = r.match_id AND e.round_num = r.round_num
        WHERE r.match_id IS NULL
    """)
    print(f"fact_round_events missing fact_rounds: {missing_event_rounds}")

    zero_kda = {
        side: _scalar(conn, f"""
            SELECT COUNT(*)
            FROM fact_match_players_{side}
            WHERE COALESCE(kills,0)=0
              AND COALESCE(deaths,0)=0
              AND COALESCE(assists,0)=0
        """)
        for side in ('t', 'ct')
    }
    print(f"fact_match_players_t zero K/D/A: {zero_kda['t']}")
    print(f"fact_match_players_ct zero K/D/A: {zero_kda['ct']}")


def _load_full_vs_side(conn):
    """Return full-match stats left-joined with the ``_t`` and ``_ct`` stats.

    Side columns carry a ``_t`` / ``_ct`` suffix; rows are keyed by
    (match_id, steam_id_64).
    """
    select = "SELECT match_id, steam_id_64, " + ",".join(STAT_COLS) + " FROM "
    keys = ['match_id', 'steam_id_64']
    df = pd.read_sql(select + "fact_match_players", conn)
    for side in ('t', 'ct'):
        df_side = pd.read_sql(select + f"fact_match_players_{side}", conn)
        df_side = df_side.rename(columns={c: f"{c}_{side}" for c in STAT_COLS})
        df = df.merge(df_side, on=keys, how='left')
    return df


def _print_full_vs_side(df):
    """Print empty counts and full-vs-side mismatches for every stat column."""
    print("\n--- Full vs T/CT Comparison ---")
    for c in STAT_COLS:
        empty_count = _is_empty(df[c]).sum()
        print(f"{c} empty: {empty_count}")

    for c in ADDITIVE_COLS:
        side_sum = df[f"{c}_t"].fillna(0) + df[f"{c}_ct"].fillna(0)
        # flash_duration gets a small tolerance; the rest must match exactly.
        tol = 0.01 if c == 'flash_duration' else 0
        diff = (df[c].fillna(0) - side_sum).abs() > tol
        print(f"{c} full != t+ct: {diff.sum()}")

    for c in NON_ADDITIVE_COLS:
        full_empty = _is_empty(df[c])
        side_nonempty = _is_nonzero(df[f"{c}_t"]) | _is_nonzero(df[f"{c}_ct"])
        print(f"{c} full empty but side has: {(full_empty & side_nonempty).sum()}")
        print(f"{c} full has but side empty: {(~full_empty & ~side_nonempty).sum()}")


def _print_rating_detail(conn, df):
    """Print null/zero/nonzero breakdowns for the rating columns."""
    print("\n--- Rating Detail ---")
    for c in RATING_COLS:
        full_nonzero = _is_nonzero(df[c])
        t_nonzero = _is_nonzero(df[f"{c}_t"])
        ct_nonzero = _is_nonzero(df[f"{c}_ct"])
        side_any_nonzero = t_nonzero | ct_nonzero
        print(
            f"{c} full null: {df[c].isna().sum()} "
            f"full zero: {(df[c] == 0).sum()} "
            f"full nonzero: {full_nonzero.sum()}"
        )
        print(f"{c} side t nonzero: {t_nonzero.sum()} side ct nonzero: {ct_nonzero.sum()}")
        print(f"{c} full nonzero but side all zero: {(full_nonzero & ~side_any_nonzero).sum()}")
        print(f"{c} full zero but side has: {(~full_nonzero & side_any_nonzero).sum()}")

    df_rating_src = pd.read_sql(
        "SELECT f.rating, f.rating2, f.rating3, m.data_source_type "
        "FROM fact_match_players f JOIN fact_matches m ON f.match_id = m.match_id",
        conn,
    )
    for c in RATING_COLS:
        # Use the same "nonzero" definition as above: a bare (s != 0) would
        # count NULL ratings as nonzero because NaN != 0 is True.
        grp = (
            df_rating_src.groupby('data_source_type')[c]
            .apply(lambda s: _is_nonzero(s).sum())
            .reset_index(name='nonzero')
        )
        print(f"{c} nonzero by source")
        print(grp)


def _load_schema_paths(schema_file):
    """Return the second column of every data row of the flat schema CSV."""
    with open(schema_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        return [row[1] for row in reader if len(row) >= 2]


def _map_fight_keys(fight_keys, l2_cols):
    """Map each fight_any key to the existing L2 column that stores it.

    A key maps to the column of the same name, or else to its aliased column
    from FIGHT_ANY_ALIAS. Keys whose column does not exist in *l2_cols* are
    left out, so every mapped column is safe to SELECT.
    """
    column_for_key = {key: col for col, key in FIGHT_ANY_ALIAS.items()}
    mapping = {}
    for key in fight_keys:
        if key in l2_cols:
            mapping[key] = key
        elif column_for_key.get(key) in l2_cols:
            mapping[key] = column_for_key[key]
    return mapping


def _print_fight_any_coverage(conn, paths):
    """Print which fight_any keys have an L2 column and how often it is zero."""
    print("\n--- Schema Coverage (fight_any) ---")
    fight_keys = {
        p.split('fight_any.')[1].split('.')[0]
        for p in paths
        if 'data.group_N[].fight_any.' in p
    }
    # Row index 1 of PRAGMA table_info is the column name.
    l2_cols = {row[1] for row in conn.execute("PRAGMA table_info(fact_match_players)")}
    # One mapping drives both the coverage counts and the zero-rate table so
    # the two sections cannot disagree about what is covered.
    col_map = _map_fight_keys(fight_keys, l2_cols)
    missing_keys = sorted(fight_keys - set(col_map))
    print(f"fight_any keys: {len(fight_keys)}")
    print(f"covered by L2 columns: {len(col_map)}")
    print(f"uncovered fight_any keys: {len(missing_keys)}")
    if missing_keys:
        print(missing_keys)

    print("\n--- Coverage Zero Rate (fight_any -> fact_match_players) ---")
    if not col_map:
        # Nothing to select; sorting an empty frame by name would raise.
        print("no fight_any keys map to fact_match_players columns")
        return
    select_cols = sorted(set(col_map.values()))
    df_fight = pd.read_sql(
        "SELECT " + ",".join(select_cols) + " FROM fact_match_players", conn
    )
    total_rows = len(df_fight)
    stats = []
    for fight_key, col in sorted(col_map.items()):
        s = df_fight[col]
        zeros = (s == 0).sum()
        nulls = s.isna().sum()
        stats.append({
            "fight_key": fight_key,
            "column": col,
            "nonzero": total_rows - zeros - nulls,
            "zero": zeros,
            "null": nulls,
            "zero_rate": 0 if total_rows == 0 else round(zeros / total_rows, 4),
        })
    df_stats = pd.DataFrame(stats).sort_values(
        ["zero_rate", "nonzero"], ascending=[False, True]
    )
    print(df_stats.head(30))
    print("\n-- zero_rate top (most zeros) --")
    print(df_stats.head(10))
    print("\n-- zero_rate bottom (most nonzero) --")
    print(df_stats.tail(10))


def _print_economy_coverage(paths):
    """Print how many schema paths contain each leetify economy fragment."""
    print("\n--- Schema Coverage (leetify economy) ---")
    for prefix in ECON_PREFIXES:
        count = sum(1 for p in paths if prefix in p)
        print(f"{prefix} paths: {count}")


def verify(db_file=None, schema_file=None):
    """Print the full verification report for the L2 database to stdout.

    Args:
        db_file: Path of the SQLite database. Defaults to the module-level
            ``db_path``, looked up at call time.
        schema_file: Path of the flat JSON-schema CSV (header row, path in the
            second column). Defaults to the module-level ``schema_path``.

    Raises:
        FileNotFoundError: If the schema CSV does not exist.
        Exception: Database errors (for example a missing table or column)
            propagate from sqlite3 / pandas unchanged.

    The connection is closed even when a section fails part-way through.
    """
    conn = sqlite3.connect(db_path if db_file is None else db_file)
    try:
        _print_counts(conn)
        _print_source_distribution(conn)
        _print_sample_events(conn)
        _print_sample_player_stats(conn)
        _print_integrity_checks(conn)

        df = _load_full_vs_side(conn)
        _print_full_vs_side(df)
        _print_rating_detail(conn, df)

        paths = _load_schema_paths(schema_path if schema_file is None else schema_file)
        _print_fight_any_coverage(conn, paths)
        _print_economy_coverage(paths)
    finally:
        conn.close()


if __name__ == "__main__":
    verify()