diff --git a/ETL/L2_Builder.py b/ETL/L2_Builder.py
index ac3ef47..79adfc9 100644
--- a/ETL/L2_Builder.py
+++ b/ETL/L2_Builder.py
@@ -643,6 +643,16 @@ class MatchParser:
                     side_stats.throw_harm_enemy = safe_int(fight_side.get('throw_harm_enemy'))
                     side_stats.uid = safe_int(fight_side.get('uid'))
                     side_stats.year = safe_text(fight_side.get('year'))
+
+                    # Map missing fields
+                    side_stats.clutch_1v1 = side_stats.end_1v1
+                    side_stats.clutch_1v2 = side_stats.end_1v2
+                    side_stats.clutch_1v3 = side_stats.end_1v3
+                    side_stats.clutch_1v4 = side_stats.end_1v4
+                    side_stats.clutch_1v5 = side_stats.end_1v5
+                    side_stats.entry_kills = side_stats.first_kill
+                    side_stats.entry_deaths = side_stats.first_death
+
                     return side_stats
 
                 team_id_value = safe_int(fight.get('match_team_id'))
@@ -725,6 +735,15 @@ class MatchParser:
                     stats.uid = safe_int(fight.get('uid'))
                     stats.year = safe_text(fight.get('year'))
 
+                    # Map missing fields
+                    stats.clutch_1v1 = stats.end_1v1
+                    stats.clutch_1v2 = stats.end_1v2
+                    stats.clutch_1v3 = stats.end_1v3
+                    stats.clutch_1v4 = stats.end_1v4
+                    stats.clutch_1v5 = stats.end_1v5
+                    stats.entry_kills = stats.first_kill
+                    stats.entry_deaths = stats.first_death
+
                 except Exception as e:
                     logger.error(f"Error parsing stats for {steam_id} in {self.match_id}: {e}")
                     pass
@@ -754,6 +773,9 @@ class MatchParser:
                         p.fd_t = int(vdata.get('fd_t', 0))
                         p.damage_receive = int(vdata.get('damage_receive', 0))
                         p.damage_stats = int(vdata.get('damage_stats', 0))
+                        p.damage_total = int(vdata.get('damage_total', 0))
+                        p.damage_received = int(vdata.get('damage_received', 0))
+                        p.flash_assists = int(vdata.get('flash_assists', 0))
                     else:
                         # Try to match by 5E ID if possible, but here keys are steamids usually
                         pass
@@ -888,12 +910,26 @@ class MatchParser:
                     equipment_value = player_bron_crash.get(str(sid))
                     equipment_value = int(equipment_value) if equipment_value is not None else 0
                     main_weapon = pick_main_weapon(items)
+
+                    has_helmet = False
+                    has_defuser = False
+                    if isinstance(items, list):
+                        for it in items:
+                            if isinstance(it, dict):
+                                name = it.get('WeaponName', '')
+                                if name == 'item_assaultsuit':
+                                    has_helmet = True
+                                elif name == 'item_defuser':
+                                    has_defuser = True
+
                     rd.economies.append(PlayerEconomy(
                         steam_id_64=str(sid),
                         side=side,
                         start_money=start_money,
                         equipment_value=equipment_value,
                         main_weapon=main_weapon,
+                        has_helmet=has_helmet,
+                        has_defuser=has_defuser,
                         round_performance_score=float(score)
                     ))
 
diff --git a/ETL/verify/verify_deep.py b/ETL/verify/verify_deep.py
index d7b2d54..f31b1b2 100644
--- a/ETL/verify/verify_deep.py
+++ b/ETL/verify/verify_deep.py
@@ -1,81 +1,82 @@
 import sqlite3
 import pandas as pd
 import numpy as np
+import sys
 
+# 设置pandas显示选项,确保不省略任何行和列
+pd.set_option('display.max_rows', None)
 pd.set_option('display.max_columns', None)
-pd.set_option('display.width', 1000)
+pd.set_option('display.width', 2000)
 pd.set_option('display.float_format', '{:.2f}'.format)
+pd.set_option('display.max_colwidth', None)
 
 db_path = 'database/L2/L2_Main.sqlite'
 
-def check_nulls_zeros():
+def check_all_tables():
     conn = sqlite3.connect(db_path)
 
-    print("=== 1. Fact Match Players: 关键字段零值/空值检查 ===")
-    df_players = pd.read_sql("""
-        SELECT
-            kills, deaths, assists, adr, rating, rating2,
-            kast, awp_kills, flash_duration, jump_count,
-            elo_change
-        FROM fact_match_players
-    """, conn)
-
-    stats = []
-    for col in df_players.columns:
-        total = len(df_players)
-        nulls = df_players[col].isnull().sum()
-        zeros = (df_players[col] == 0).sum()
-        stats.append({
-            'Field': col,
-            'Total': total,
-            'Nulls': nulls,
-            'Null%': (nulls/total)*100,
-            'Zeros': zeros,
-            'Zero%': (zeros/total)*100
-        })
-    print(pd.DataFrame(stats))
-
-    print("\n=== 2. Fact Round Events (Kills): 击杀完整性检查 ===")
-    # 只检查 event_type = 'kill' 的记录
-    df_kills = pd.read_sql("""
-        SELECT
-            attacker_steam_id, victim_steam_id,
-            event_time, weapon,
-            attacker_pos_x, score_change_attacker
-        FROM fact_round_events
-        WHERE event_type = 'kill'
-    """, conn)
-
-    total_kills = len(df_kills)
-    missing_attacker = df_kills['attacker_steam_id'].isnull().sum() + (df_kills['attacker_steam_id'] == '').sum()
-    missing_victim = df_kills['victim_steam_id'].isnull().sum() + (df_kills['victim_steam_id'] == '').sum()
-
-    # 检查 attacker 和 victim 是否相同(自杀)
-    self_kills = (df_kills['attacker_steam_id'] == df_kills['victim_steam_id']).sum()
-
-    print(f"Total Kill Events: {total_kills}")
-    print(f"Missing Attacker: {missing_attacker} ({missing_attacker/total_kills*100:.2f}%)")
-    print(f"Missing Victim: {missing_victim} ({missing_victim/total_kills*100:.2f}%)")
-    print(f"Self Kills (Suicide?): {self_kills}")
-
-    print("\n=== 3. Fact Round Events: 坐标与评分覆盖率 ===")
-    # 坐标应该在 classic 比赛中有值,leetify 比赛中可能为空
-    # 评分应该在 leetify 比赛中有值
-
-    df_events = pd.read_sql("""
-        SELECT
-            m.data_source_type,
-            COUNT(*) as total_events,
-            SUM(CASE WHEN e.attacker_pos_x IS NOT NULL AND e.attacker_pos_x != 0 THEN 1 ELSE 0 END) as has_pos,
-            SUM(CASE WHEN e.score_change_attacker IS NOT NULL AND e.score_change_attacker != 0 THEN 1 ELSE 0 END) as has_score
-        FROM fact_round_events e
-        JOIN fact_matches m ON e.match_id = m.match_id
-        WHERE e.event_type = 'kill'
-        GROUP BY m.data_source_type
-    """, conn)
-    print(df_events)
+    # 获取所有表名
+    tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'", conn)['name'].tolist()
 
+    for table in tables:
+        print(f"\n{'='*20} Table: {table} {'='*20}")
+
+        # 获取表的所有列
+        cols_info = pd.read_sql(f"PRAGMA table_info({table})", conn)
+        cols = cols_info['name'].tolist()
+
+        # 读取全表数据
+        df = pd.read_sql(f"SELECT * FROM {table}", conn)
+        total = len(df)
+
+        if total == 0:
+            print(f"Table is empty (0 rows)")
+            continue
+
+        print(f"Total Rows: {total}")
+        print("-" * 60)
+
+        stats = []
+        for col in cols:
+            # 1. Null Check
+            nulls = df[col].isnull().sum()
+
+            # 2. Zero Check (仅对数值型或可转换为数值的列)
+            zeros = 0
+            try:
+                # 尝试转为数值,无法转换的变为NaN
+                numeric_series = pd.to_numeric(df[col], errors='coerce')
+                # 统计0值 (排除原本就是NaN的)
+                zeros = (numeric_series == 0).sum()
+            except:
+                zeros = 0
+
+            # 3. Unique Count (基数)
+            unique_count = df[col].nunique()
+
+            # 4. Example Value (取第一个非空值)
+            example = df[col].dropna().iloc[0] if df[col].count() > 0 else 'ALL NULL'
+
+            stats.append({
+                'Field': col,
+                'Nulls': nulls,
+                'Null%': (nulls/total)*100,
+                'Zeros': zeros,
+                'Zero%': (zeros/total)*100,
+                'Unique': unique_count,
+                'Example': str(example)[:50]  # 截断过长示例
+            })
+
+        # 输出完整统计表
+        df_stats = pd.DataFrame(stats)
+        # 按 Zero% 降序排列,但保证 Null% 高的也显眼,这里默认不排序直接按字段序,或者按关注度排序
+        # 用户要求全面探查,按字段原序输出可能更符合直觉,或者按Zero%排序
+        # 这里为了排查问题,按 Zero% 降序输出
+        df_stats = df_stats.sort_values('Zero%', ascending=False)
+        print(df_stats.to_string(index=False))
+        print("\n")
+
     conn.close()
 
 if __name__ == "__main__":
-    check_nulls_zeros()
+    check_all_tables()
diff --git a/ETL/verify/verify_report.txt b/ETL/verify/verify_report.txt
new file mode 100644
index 0000000..641b571
Binary files /dev/null and b/ETL/verify/verify_report.txt differ
diff --git a/database/L2/L2_Main.sqlite b/database/L2/L2_Main.sqlite
index a163f6f..6f58450 100644
Binary files a/database/L2/L2_Main.sqlite and b/database/L2/L2_Main.sqlite differ