Remove test files.

This commit is contained in:
2026-01-28 17:00:47 +08:00
parent 582423a72f
commit 1642adb00e
11 changed files with 1043 additions and 978 deletions

View File

@@ -1,39 +0,0 @@
import sqlite3
import os
# Paths point at the production database: BASE_DIR is three directory levels
# above this script, and the SQLite file lives under database/L1A/.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
DB_PATH = os.path.join(BASE_DIR, 'database', 'L1A', 'L1A.sqlite')
def clean_db():
    """Delete the known mock match rows from ``raw_iframe_network``.

    Prints a message and returns early when the database file at ``DB_PATH``
    does not exist. Otherwise issues one parameterized DELETE per mock
    ``match_id``, commits, and prints a summary. ``deleted_count`` counts the
    match ids for which at least one row was removed.

    The connection is always closed, even when a statement raises; in that
    case nothing is committed.
    """
    if not os.path.exists(DB_PATH):
        print(f"Database not found at {DB_PATH}")
        return

    print(f"Connecting to production DB: {DB_PATH}")

    # Dirty data: the mock records are assumed to use exactly these ids.
    # A LIKE 'match_%' filter would be more thorough but risks deleting real
    # rows, so only exact ids are removed.
    dirty_ids = ['match_001', 'match_002', 'match_003']

    deleted_count = 0
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        for mid in dirty_ids:
            cursor.execute("DELETE FROM raw_iframe_network WHERE match_id = ?", (mid,))
            if cursor.rowcount > 0:
                print(f"Deleted dirty record: {mid}")
                deleted_count += 1
        conn.commit()
    finally:
        # Release the file handle/lock on the production DB in every case.
        conn.close()

    if deleted_count > 0:
        print(f"Cleanup complete. Removed {deleted_count} dirty records.")
    else:
        print("Cleanup complete. No dirty records found.")
# Run the cleanup only when this file is executed as a script.
if __name__ == "__main__":
    clean_db()

View File

@@ -1,35 +0,0 @@
import os
import json
# Path definitions: the project root is two directory levels above this
# script's directory, and mock data is written under <root>/output_arena.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(os.path.dirname(CURRENT_DIR))
OUTPUT_ARENA_DIR = os.path.join(PROJECT_ROOT, 'output_arena')
def create_mock_data():
    """Create three mock match folders, each holding an ``iframe_network.json``.

    The output directory and per-match directories are created when absent.
    An existing JSON file is never overwritten; a message is printed either
    way.
    """
    if not os.path.exists(OUTPUT_ARENA_DIR):
        os.makedirs(OUTPUT_ARENA_DIR)
        print(f"Created directory: {OUTPUT_ARENA_DIR}")

    # Three mock matches are generated.
    for match_id in ('match_001', 'match_002', 'match_003'):
        target_dir = os.path.join(OUTPUT_ARENA_DIR, match_id)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        target_file = os.path.join(target_dir, 'iframe_network.json')
        if os.path.exists(target_file):
            print(f"File already exists: {target_file}")
            continue

        payload = {
            "match_id": match_id,
            "data": "This is mock data for testing."
        }
        with open(target_file, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle)
        print(f"Created mock file: {target_file}")
# Generate the mock files only when this file is executed as a script.
if __name__ == "__main__":
    create_mock_data()

View File

@@ -1,76 +0,0 @@
import os
import sqlite3
import subprocess
import glob
# Path configuration.
# This script lives under ETL/verify/, so the project root is found by going
# up two levels from the script's directory.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(os.path.dirname(CURRENT_DIR))
L1_SCRIPT = os.path.join(PROJECT_ROOT, 'ETL', 'L1A.py')
DB_PATH = os.path.join(PROJECT_ROOT, 'database', 'L1A', 'L1A.sqlite')
OUTPUT_ARENA_DIR = os.path.join(PROJECT_ROOT, 'output_arena')
def get_db_count():
    """Return the number of rows in the ``raw_iframe_network`` table.

    Returns 0 when the database file at ``DB_PATH`` does not exist or when
    SQLite reports an error (for example, the table is missing). The
    connection is closed on every path.
    """
    if not os.path.exists(DB_PATH):
        return 0
    try:
        conn = sqlite3.connect(DB_PATH)
    except sqlite3.Error:
        return 0
    try:
        return conn.execute("SELECT COUNT(*) FROM raw_iframe_network").fetchone()[0]
    except sqlite3.Error:
        # Best-effort: a broken or half-initialised DB counts as empty.
        return 0
    finally:
        conn.close()
def get_file_count():
    """Return how many ``iframe_network.json`` files sit one level below ``OUTPUT_ARENA_DIR``."""
    return len(glob.glob(os.path.join(OUTPUT_ARENA_DIR, '*', 'iframe_network.json')))
def run_l1_script():
    """Run the L1 script in a child process and return its captured stdout.

    The script is invoked by absolute path (``L1_SCRIPT``), so the call does
    not depend on the current working directory for locating it. stderr is
    captured but not returned.
    """
    import sys  # local import so this block carries its own dependency

    # Use the interpreter running this test instead of whatever "python"
    # resolves to on PATH (which may be absent or a different environment).
    interpreter = sys.executable or 'python'
    result = subprocess.run([interpreter, L1_SCRIPT], capture_output=True, text=True)
    return result.stdout
def main():
    """Run L1A.py twice and check that the second run skips every source file.

    The check passes when the second run's stdout contains
    ``Skipped: <number of source files>``.
    """
    print("=== 开始 L1 增量逻辑测试 ===")
    print(f"项目根目录: {PROJECT_ROOT}")

    # 1. Inspect the environment.
    source_total = get_file_count()
    rows_before = get_db_count()
    print(f"[环境] 源文件总数: {source_total}")
    print(f"[环境] 数据库当前记录数: {rows_before}")

    # 2. First run.
    print("\n--- 运行 L1A.py (Run 1) ---")
    first_output = run_l1_script()
    print(first_output.strip())

    rows_after_first = get_db_count()
    print(f"[状态] 运行后数据库记录数: {rows_after_first}")
    if rows_after_first < source_total:
        print("警告: 数据库记录数少于文件数,可能部分文件处理失败或尚未完成。")

    # 3. Second run, used to verify the incremental behaviour.
    print("\n--- 再次运行 L1A.py (Run 2 - 验证增量) ---")
    second_output = run_l1_script()
    print(second_output.strip())

    # 4. Verify the result.
    expected_msg = f"Skipped: {source_total}"
    if expected_msg not in second_output:
        print(f"\n❌ 测试未通过。预期输出应包含 '{expected_msg}'")
        return
    print("\n✅ 测试通过! 第二次运行跳过了所有文件,增量逻辑生效。")
# Run the incremental-load check only when executed as a script.
if __name__ == "__main__":
    main()

Binary file not shown.

View File

@@ -1,504 +0,0 @@
import sqlite3
import pandas as pd
import csv
import os
import sys
import time
# Show every column and allow wide lines when DataFrames are printed.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# L2 database and flattened JSON schema listing (relative paths).
db_path = 'database/L2/L2_Main.sqlite'
schema_path = 'database/original_json_schema/schema_flat.csv'

# Suffixes that exist for both the group1_* and group2_* keys.
_TEAM_FIELD_SUFFIXES = (
    "all_score", "change_elo", "fh_role", "fh_score",
    "origin_elo", "sh_role", "sh_score", "tid", "uids",
)
# Categories that have a most_<category>_uid key.
_MOST_UID_CATEGORIES = (
    "1v2", "assist", "awp", "end", "first_kill", "headshot", "jump",
)

# First-level keys under "data.main." that is_covered() treats as covered.
covered_main_fields = {
    "match_code", "map", "start_time", "end_time", "match_winner",
    "server_ip", "server_port", "location", "location_full", "map_desc",
    "demo_url", "game_mode", "game_name", "match_mode", "match_status", "match_flag",
    "status", "waiver", "year", "season", "round_total", "cs_type", "priority_show_type",
    "pug10m_show_type", "credit_match_status", "knife_winner", "knife_winner_role",
    "mvp_uid", "id",
}
covered_main_fields |= {
    f"group{team}_{suffix}" for team in (1, 2) for suffix in _TEAM_FIELD_SUFFIXES
}
covered_main_fields |= {f"most_{category}_uid" for category in _MOST_UID_CATEGORIES}

# Substrings that mark a user_info path as covered.
covered_user_fields = {
    "data.group_N[].user_info."
}

# Substrings of round_list paths considered covered (order preserved).
_ROUND_PREFIX = "data.round_list[]."
covered_round_fields = [
    _ROUND_PREFIX + suffix
    for suffix in (
        "current_score.ct",
        "current_score.t",
        "current_score.final_round_time",
        "all_kill[].pasttime",
        "all_kill[].weapon",
        "all_kill[].headshot",
        "all_kill[].penetrated",
        "all_kill[].attackerblind",
        "all_kill[].throughsmoke",
        "all_kill[].noscope",
        "all_kill[].attacker.steamid_64",
        "all_kill[].victim.steamid_64",
        "all_kill[].attacker.pos.x",
        "all_kill[].attacker.pos.y",
        "all_kill[].attacker.pos.z",
        "all_kill[].victim.pos.x",
        "all_kill[].victim.pos.y",
        "all_kill[].victim.pos.z",
    )
]

# Substrings of leetify round_stat paths considered covered (order preserved).
_LEETIFY_PREFIX = "data.leetify_data.round_stat[]."
covered_leetify_fields = [
    _LEETIFY_PREFIX + suffix
    for suffix in (
        "round",
        "win_reason",
        "end_ts",
        "sfui_event.score_ct",
        "sfui_event.score_t",
        "ct_money_group",
        "t_money_group",
        "show_event[].ts",
        "show_event[].kill_event.Ts",
        "show_event[].kill_event.Killer",
        "show_event[].kill_event.Victim",
        "show_event[].kill_event.WeaponName",
        "show_event[].kill_event.Headshot",
        "show_event[].kill_event.Penetrated",
        "show_event[].kill_event.AttackerBlind",
        "show_event[].kill_event.ThroughSmoke",
        "show_event[].kill_event.NoScope",
        "show_event[].trade_score_change.",
        "show_event[].flash_assist_killer_score_change.",
        "show_event[].killer_score_change.",
        "show_event[].victim_score_change.",
        "bron_equipment.",
        "player_t_score.",
        "player_ct_score.",
        "player_bron_crash.",
    )
]

# First-level keys under "data.<steamid>." that is_covered() treats as covered.
covered_vip_fields = {
    "awp_kill",
    "awp_kill_ct",
    "awp_kill_t",
    "damage_receive",
    "damage_stats",
    "fd_ct",
    "fd_t",
    "kast",
}
def load_schema_paths(schema_path_value):
    """Return the second column of every data row in the schema CSV.

    Args:
        schema_path_value: Path of a UTF-8 CSV file whose first row is a
            header.

    Returns:
        A list with the value of column index 1 for each row after the
        header; rows with fewer than two columns are skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields are passed through unchanged.
    with open(schema_path_value, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # discard the header row (if any)
        return [row[1] for row in reader if len(row) >= 2]
def is_covered(path):
    """Return True when a flattened schema path counts as covered.

    A path is covered when it is one of the response-envelope keys, when its
    first key under ``data.<steamid>.`` or ``data.main.`` is in the matching
    covered set, or when it contains one of the substrings listed below.
    """
    envelope_keys = (
        "data", "code", "message", "status", "timestamp",
        "timeStamp", "traceId", "success", "errcode",
    )
    if path in envelope_keys:
        return True

    # Per-player section: only the first key after the prefix is checked.
    vip_prefix = "data.<steamid>."
    if path.startswith(vip_prefix):
        vip_key = path.split(vip_prefix)[1].split(".")[0]
        if vip_key in covered_vip_fields:
            return True

    # data.main.*: only the first key after the marker is checked.
    main_marker = "data.main."
    if main_marker in path:
        main_key = path.split(main_marker)[1].split(".")[0]
        if main_key in covered_main_fields:
            return True

    # Any of these substrings anywhere in the path marks it as covered.
    fragments = (
        "data.group_N[].fight_any.",
        "data.group_N[].fight_t.",
        "data.group_N[].fight_ct.",
        "data.group_N[].sts.",
        "data.group_N[].level_info.",
        "data.treat_info.",
        "data.has_side_data_and_rating2",
        "data.round_list",
        "data.leetify_data.",
    )
    fragments += tuple(covered_user_fields)
    fragments += tuple(covered_round_fields)
    fragments += tuple(covered_leetify_fields)
    return any(fragment in path for fragment in fragments)
def group_key(p):
    """Map a schema path to the label of the first matching path group.

    Rules are tried in order and match by substring; a path that matches
    none of them is labelled ``"other"``.
    """
    ordered_rules = (
        ("data.group_N[].user_info.", "data.group_N[].user_info.*"),
        ("data.group_N[].fight_any.", "data.group_N[].fight_any.*"),
        ("data.group_N[].fight_t.", "data.group_N[].fight_t.*"),
        ("data.group_N[].fight_ct.", "data.group_N[].fight_ct.*"),
        ("data.main.", "data.main.*"),
        ("data.round_list[]", "data.round_list.*"),
        ("data.leetify_data.round_stat[]", "data.leetify_data.round_stat.*"),
        ("data.leetify_data.", "data.leetify_data.*"),
        ("data.treat_info.", "data.treat_info.*"),
        ("data.", "data.*"),
    )
    for needle, label in ordered_rules:
        if needle in p:
            return label
    return "other"
def dump_uncovered(output_path):
    """Write every uncovered schema path, with its group label, to a CSV file.

    Args:
        output_path: Destination CSV path (written as UTF-8 with BOM).

    When every path is covered, only a message is printed and no file is
    written.
    """
    uncovered_paths = [p for p in load_schema_paths(schema_path) if not is_covered(p)]
    report = pd.DataFrame({"path": uncovered_paths})
    if report.empty:
        print("no uncovered paths")
        return

    report["group"] = report["path"].apply(group_key)
    report = report.sort_values(["group", "path"])
    report.to_csv(output_path, index=False, encoding='utf-8-sig')

    per_group = report.groupby("group").size().sort_values(ascending=False)
    print(f"uncovered total: {len(report)}")
    print("\n-- uncovered groups (count) --")
    print(per_group)
    print(f"\noutput: {output_path}")
def print_schema(conn):
    """Print column name, declared type and pk flag for every user table.

    Args:
        conn: An open ``sqlite3`` connection.

    Tables are listed in name order (internal ``sqlite_*`` tables are
    skipped). Each table is printed as a small aligned text table with a
    dashed rule under the header.
    """
    tables = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name"
    ).fetchall()
    for (name,) in tables:
        print(f"\n[{name}]")
        # Quote the identifier so names containing spaces, dashes or quotes
        # do not break the PRAGMA statement.
        quoted = '"' + name.replace('"', '""') + '"'
        cols = conn.execute(f"PRAGMA table_info({quoted})").fetchall()

        rows = [["column", "type", "pk"]]
        rows.extend(
            [col_name, col_type or "", str(pk)]
            for _, col_name, col_type, _, _, pk in cols
        )
        widths = [max(len(r[i]) for r in rows) for i in range(3)]
        for idx, r in enumerate(rows):
            line = " | ".join(cell.ljust(width) for cell, width in zip(r, widths))
            print(line)
            if idx == 0:
                print("-" * len(line))
def refresh_schema_sql(conn, output_path):
    """Dump the CREATE statements of all user tables and indexes to a file.

    Args:
        conn: An open ``sqlite3`` connection.
        output_path: File to (over)write, encoded as UTF-8.

    Tables come first, then indexes, each group ordered by name. The file
    starts with ``PRAGMA foreign_keys = ON;`` and statements are separated
    by blank lines.
    """
    ddl_rows = conn.execute("""
        SELECT type, name, sql
        FROM sqlite_master
        WHERE sql IS NOT NULL AND type IN ('table', 'index') AND name NOT LIKE 'sqlite_%'
        ORDER BY CASE WHEN type='table' THEN 0 ELSE 1 END, name
    """).fetchall()

    chunks = ["PRAGMA foreign_keys = ON;", ""]
    for _kind, _name, statement in ddl_rows:
        chunks += [statement.strip() + ";", ""]

    text = "\n".join(chunks).strip() + "\n"
    with open(output_path, 'w', encoding='utf-8') as out:
        out.write(text)
def _print_table_counts(conn):
    """Print the row count of each expected L2 table."""
    print("--- Counts ---")
    tables = [
        'dim_players',
        'dim_maps',
        'fact_matches',
        'fact_match_teams',
        'fact_match_players',
        'fact_match_players_t',
        'fact_match_players_ct',
        'fact_rounds',
        'fact_round_events',
        'fact_round_player_economy'
    ]
    for t in tables:
        # Table names are the fixed literals above, so interpolation is safe.
        count = conn.execute(f"SELECT COUNT(*) FROM {t}").fetchone()[0]
        print(f"{t}: {count}")


def _print_source_distribution(conn):
    """Print how many matches exist per data_source_type."""
    print("\n--- Data Source Distribution ---")
    dist = pd.read_sql("SELECT data_source_type, COUNT(*) as cnt FROM fact_matches GROUP BY data_source_type", conn)
    print(dist)


def _print_sample_kill_event(conn, source_type, label):
    """Print one kill event from the first match of the given source type, if any."""
    match = conn.execute(
        "SELECT match_id FROM fact_matches WHERE data_source_type=? LIMIT 1",
        (source_type,),
    ).fetchone()
    if not match:
        return
    mid = match[0]
    print(f"{label} Match: {mid}")
    # Bind the id as a parameter instead of formatting it into the SQL text.
    df = pd.read_sql(
        "SELECT * FROM fact_round_events WHERE match_id=? AND event_type='kill' LIMIT 1",
        conn,
        params=(mid,),
    )
    print(df[['event_type', 'attacker_steam_id', 'trade_killer_steam_id', 'attacker_pos_x', 'score_change_attacker']])


def _print_sample_player_stats(conn):
    """Print five rows of selected fact_match_players columns."""
    print("\n--- Sample Player Stats (New Fields) ---")
    df_players = pd.read_sql("SELECT steam_id_64, rating, rating3, elo_change, rank_score, flash_duration, jump_count FROM fact_match_players LIMIT 5", conn)
    print(df_players)


def _print_insert_field_checks(conn):
    """Print how many fact_matches rows have each response/raw field populated."""
    print("\n--- Insert Field Checks ---")
    meta_counts = conn.execute("""
        SELECT
        SUM(CASE WHEN response_code IS NOT NULL THEN 1 ELSE 0 END) AS response_code_cnt,
        SUM(CASE WHEN response_trace_id IS NOT NULL AND response_trace_id != '' THEN 1 ELSE 0 END) AS response_trace_id_cnt,
        SUM(CASE WHEN response_success IS NOT NULL THEN 1 ELSE 0 END) AS response_success_cnt,
        SUM(CASE WHEN response_errcode IS NOT NULL THEN 1 ELSE 0 END) AS response_errcode_cnt,
        SUM(CASE WHEN treat_info_raw IS NOT NULL AND treat_info_raw != '' THEN 1 ELSE 0 END) AS treat_info_raw_cnt,
        SUM(CASE WHEN round_list_raw IS NOT NULL AND round_list_raw != '' THEN 1 ELSE 0 END) AS round_list_raw_cnt,
        SUM(CASE WHEN leetify_data_raw IS NOT NULL AND leetify_data_raw != '' THEN 1 ELSE 0 END) AS leetify_data_raw_cnt
        FROM fact_matches
    """).fetchone()
    labels = (
        "response_code non-null",
        "response_trace_id non-empty",
        "response_success non-null",
        "response_errcode non-null",
        "treat_info_raw non-empty",
        "round_list_raw non-empty",
        "leetify_data_raw non-empty",
    )
    for label, value in zip(labels, meta_counts):
        print(f"{label}: {value}")


def _print_integrity_checks(conn):
    """Print counts of orphaned fact rows and of side rows with zero K/D/A."""
    print("\n--- Integrity Checks ---")
    missing_players = conn.execute("""
        SELECT COUNT(*) FROM fact_match_players f
        LEFT JOIN dim_players d ON f.steam_id_64 = d.steam_id_64
        WHERE d.steam_id_64 IS NULL
    """).fetchone()[0]
    print(f"fact_match_players missing dim_players: {missing_players}")
    missing_round_matches = conn.execute("""
        SELECT COUNT(*) FROM fact_rounds r
        LEFT JOIN fact_matches m ON r.match_id = m.match_id
        WHERE m.match_id IS NULL
    """).fetchone()[0]
    print(f"fact_rounds missing fact_matches: {missing_round_matches}")
    missing_event_rounds = conn.execute("""
        SELECT COUNT(*) FROM fact_round_events e
        LEFT JOIN fact_rounds r ON e.match_id = r.match_id AND e.round_num = r.round_num
        WHERE r.match_id IS NULL
    """).fetchone()[0]
    print(f"fact_round_events missing fact_rounds: {missing_event_rounds}")
    side_zero_t = conn.execute("""
        SELECT COUNT(*) FROM fact_match_players_t
        WHERE COALESCE(kills,0)=0 AND COALESCE(deaths,0)=0 AND COALESCE(assists,0)=0
    """).fetchone()[0]
    side_zero_ct = conn.execute("""
        SELECT COUNT(*) FROM fact_match_players_ct
        WHERE COALESCE(kills,0)=0 AND COALESCE(deaths,0)=0 AND COALESCE(assists,0)=0
    """).fetchone()[0]
    print(f"fact_match_players_t zero K/D/A: {side_zero_t}")
    print(f"fact_match_players_ct zero K/D/A: {side_zero_ct}")


def _print_side_comparison(conn):
    """Compare full-match player stats with the T-side and CT-side tables."""
    print("\n--- Full vs T/CT Comparison ---")
    cols = [
        'kills', 'deaths', 'assists', 'headshot_count', 'adr', 'rating', 'rating2',
        'rating3', 'rws', 'mvp_count', 'flash_duration', 'jump_count', 'is_win'
    ]
    keys = ['match_id', 'steam_id_64']
    select_prefix = "SELECT match_id, steam_id_64, " + ",".join(cols)
    df_full = pd.read_sql(select_prefix + " FROM fact_match_players", conn)
    df_t = pd.read_sql(
        select_prefix + " FROM fact_match_players_t", conn
    ).rename(columns={c: f"{c}_t" for c in cols})
    df_ct = pd.read_sql(
        select_prefix + " FROM fact_match_players_ct", conn
    ).rename(columns={c: f"{c}_ct" for c in cols})
    df = df_full.merge(df_t, on=keys, how='left').merge(df_ct, on=keys, how='left')

    def is_empty(s):
        # "Empty" means NULL or exactly zero.
        return s.isna() | (s == 0)

    for c in cols:
        empty_count = is_empty(df[c]).sum()
        print(f"{c} empty: {empty_count}")

    # Counters: the full value is compared with the sum of both sides.
    additive = ['kills', 'deaths', 'assists', 'headshot_count', 'mvp_count', 'flash_duration', 'jump_count']
    for c in additive:
        t_sum = df[f"{c}_t"].fillna(0) + df[f"{c}_ct"].fillna(0)
        # flash_duration gets a small tolerance; the other counters must match exactly.
        tol = 0.01 if c == 'flash_duration' else 0
        diff = (df[c].fillna(0) - t_sum).abs() > tol
        print(f"{c} full != t+ct: {diff.sum()}")

    # Non-additive stats: only presence is compared, not values.
    non_additive = ['adr', 'rating', 'rating2', 'rating3', 'rws', 'is_win']
    for c in non_additive:
        side_nonempty = (~is_empty(df[f"{c}_t"])) | (~is_empty(df[f"{c}_ct"]))
        full_empty_side_nonempty = is_empty(df[c]) & side_nonempty
        full_nonempty_side_empty = (~is_empty(df[c])) & (~side_nonempty)
        print(f"{c} full empty but side has: {full_empty_side_nonempty.sum()}")
        print(f"{c} full has but side empty: {full_nonempty_side_empty.sum()}")

    print("\n--- Rating Detail ---")
    rating_cols = ['rating', 'rating2', 'rating3']
    for c in rating_cols:
        full_nonzero_mask = df[c].notna() & (df[c] != 0)
        t_nonzero_mask = df[f"{c}_t"].notna() & (df[f"{c}_t"] != 0)
        ct_nonzero_mask = df[f"{c}_ct"].notna() & (df[f"{c}_ct"] != 0)
        side_any_nonzero = t_nonzero_mask | ct_nonzero_mask

        full_null = df[c].isna().sum()
        full_zero = (df[c] == 0).sum()
        full_nonzero = full_nonzero_mask.sum()
        side_t_nonzero = t_nonzero_mask.sum()
        side_ct_nonzero = ct_nonzero_mask.sum()
        full_nonzero_side_zero = (full_nonzero_mask & ~side_any_nonzero).sum()
        full_zero_side_nonzero = ((df[c].isna() | (df[c] == 0)) & side_any_nonzero).sum()
        print(f"{c} full null: {full_null} full zero: {full_zero} full nonzero: {full_nonzero}")
        print(f"{c} side t nonzero: {side_t_nonzero} side ct nonzero: {side_ct_nonzero}")
        print(f"{c} full nonzero but side all zero: {full_nonzero_side_zero}")
        print(f"{c} full zero but side has: {full_zero_side_nonzero}")

    df_rating_src = pd.read_sql(
        "SELECT f.rating, f.rating2, f.rating3, m.data_source_type FROM fact_match_players f JOIN fact_matches m ON f.match_id = m.match_id",
        conn
    )
    for c in rating_cols:
        # NaN != 0 is True, so NULL ratings must be excluded explicitly to
        # stay consistent with the "nonzero" counts printed above.
        grp = df_rating_src.groupby('data_source_type')[c].apply(
            lambda s: (s.notna() & (s != 0)).sum()
        ).reset_index(name='nonzero')
        print(f"{c} nonzero by source")
        print(grp)


def _print_fight_any_coverage(conn, paths):
    """Report which fight_any schema keys map to fact_match_players columns and how often they are zero."""
    print("\n--- Schema Coverage (fight_any) ---")
    fight_keys = {
        p.split('fight_any.')[1].split('.')[0]
        for p in paths
        if 'data.group_N[].fight_any.' in p
    }
    l2_cols = set(pd.read_sql("PRAGMA table_info(fact_match_players)", conn)['name'].tolist())
    # L2 column name -> fight_any key it was renamed from.
    alias = {
        'kills': 'kill',
        'deaths': 'death',
        'assists': 'assist',
        'headshot_count': 'headshot',
        'mvp_count': 'is_mvp',
        'flash_duration': 'flash_enemy_time',
        'jump_count': 'jump_total',
        'awp_kills': 'awp_kill'
    }
    covered = set()
    for c in l2_cols:
        if c in fight_keys:
            covered.add(c)
        elif c in alias and alias[c] in fight_keys:
            covered.add(alias[c])
    missing_keys = sorted(fight_keys - covered)
    print(f"fight_any keys: {len(fight_keys)}")
    print(f"covered by L2 columns: {len(covered)}")
    print(f"uncovered fight_any keys: {len(missing_keys)}")
    if missing_keys:
        print(missing_keys)

    print("\n--- Coverage Zero Rate (fight_any -> fact_match_players) ---")
    # fight_any key -> existing L2 column. An alias is only used when its
    # column really exists, otherwise the SELECT below would fail.
    column_for_key = {src: col for col, src in alias.items()}
    col_map = {}
    for k in fight_keys:
        if k in l2_cols:
            col_map[k] = k
        elif column_for_key.get(k) in l2_cols:
            col_map[k] = column_for_key[k]

    select_cols = ["steam_id_64"] + sorted(set(col_map.values()))
    df_fight = pd.read_sql(
        "SELECT " + ",".join(select_cols) + " FROM fact_match_players",
        conn
    )
    total_rows = len(df_fight)
    stats = []
    for fight_key, col in sorted(col_map.items()):
        s = df_fight[col]
        zeros = (s == 0).sum()
        nulls = s.isna().sum()
        nonzero = total_rows - zeros - nulls
        stats.append({
            "fight_key": fight_key,
            "column": col,
            "nonzero": nonzero,
            "zero": zeros,
            "null": nulls,
            "zero_rate": 0 if total_rows == 0 else round(zeros / total_rows, 4)
        })
    if not stats:
        # An empty frame has no "zero_rate" column to sort by.
        print("no fight_any keys map to fact_match_players columns")
        return
    df_stats = pd.DataFrame(stats).sort_values(["zero_rate", "nonzero"], ascending=[False, True])
    print(df_stats.head(30))
    print("\n-- zero_rate top (most zeros) --")
    print(df_stats.head(10))
    print("\n-- zero_rate bottom (most nonzero) --")
    print(df_stats.tail(10))


def _print_schema_summary(paths):
    """Print leetify economy path counts and the overall covered/uncovered summary."""
    print("\n--- Schema Coverage (leetify economy) ---")
    econ_keys = [
        'data.leetify_data.round_stat[].bron_equipment.',
        'data.leetify_data.round_stat[].player_t_score.',
        'data.leetify_data.round_stat[].player_ct_score.',
        'data.leetify_data.round_stat[].player_bron_crash.'
    ]
    for k in econ_keys:
        count = sum(1 for p in paths if k in p)
        print(f"{k} paths: {count}")

    print("\n--- Schema Summary Coverage (by path groups) ---")
    uncovered = [p for p in paths if not is_covered(p)]
    print(f"total paths: {len(paths)}")
    print(f"covered paths: {len(paths) - len(uncovered)}")
    print(f"uncovered paths: {len(uncovered)}")
    df_unc = pd.DataFrame({"path": uncovered})
    if len(df_unc) > 0:
        df_unc["group"] = df_unc["path"].apply(group_key)
        print("\n-- Uncovered groups (count) --")
        print(df_unc.groupby("group").size().sort_values(ascending=False))
        print("\n-- Uncovered examples (top 50) --")
        print(df_unc["path"].head(50).to_list())


def verify():
    """Print a diagnostic report about the L2 database and schema coverage.

    Opens the SQLite file at ``db_path`` and prints, in order: table counts,
    data-source distribution, sample kill events, sample player stats,
    populated-field counts, integrity checks, full vs. T/CT comparisons and
    schema coverage derived from the CSV at ``schema_path``.

    The connection is closed even when one of the report sections raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        _print_table_counts(conn)
        _print_source_distribution(conn)

        print("\n--- Sample Round Events (Leetify vs Classic) ---")
        _print_sample_kill_event(conn, 'leetify', 'Leetify')
        _print_sample_kill_event(conn, 'classic', 'Classic')

        _print_sample_player_stats(conn)
        _print_insert_field_checks(conn)
        _print_integrity_checks(conn)
        _print_side_comparison(conn)

        paths = load_schema_paths(schema_path)
        _print_fight_any_coverage(conn, paths)
        _print_schema_summary(paths)
    finally:
        conn.close()
def watch_schema(schema_path, interval=1.0):
    """Regenerate and print the schema whenever the database or schema file changes.

    Args:
        schema_path: Path of the schema SQL file to write. This parameter
            shadows the module-level ``schema_path`` inside the function.
        interval: Seconds to sleep between polls.

    Runs until interrupted. On the first pass, and whenever the modification
    time of ``db_path`` or of the schema file increases, the schema file is
    rewritten and the table layout printed.
    """
    last_db_mtime = 0
    last_schema_mtime = 0
    first = True
    while True:
        if not os.path.exists(db_path):
            print(f"db not found: {db_path}")
            time.sleep(interval)
            continue

        db_mtime = os.path.getmtime(db_path)
        schema_mtime = os.path.getmtime(schema_path) if os.path.exists(schema_path) else 0
        if first or db_mtime > last_db_mtime or schema_mtime > last_schema_mtime:
            conn = sqlite3.connect(db_path)
            try:
                refresh_schema_sql(conn, schema_path)
                print(f"\n[{time.strftime('%Y-%m-%d %H:%M:%S')}] schema.sql refreshed")
                print_schema(conn)
            finally:
                # This loop never ends, so a handle must not leak per failure.
                conn.close()
            last_db_mtime = db_mtime
            # Re-read after writing so our own refresh is not seen as a change.
            last_schema_mtime = os.path.getmtime(schema_path) if os.path.exists(schema_path) else 0
            first = False
        time.sleep(interval)
# Command-line dispatch. Mode keywords are matched case-insensitively and may
# appear anywhere among the arguments; with no known keyword the full
# verification report runs.
if __name__ == "__main__":
    requested = {arg.lower() for arg in sys.argv[1:]}

    if requested & {"dump_uncovered", "uncovered"}:
        dump_uncovered('database/original_json_schema/uncovered_features.csv')
    elif requested & {"watch_schema", "watch"}:
        try:
            watch_schema('database/L2/schema.sql')
        except KeyboardInterrupt:
            # Ctrl+C is the normal way to stop watching.
            pass
    elif requested & {"schema", "refresh_schema"}:
        if os.path.exists(db_path):
            conn = sqlite3.connect(db_path)
            if "refresh_schema" in requested:
                refresh_schema_sql(conn, 'database/L2/schema.sql')
                print("schema.sql refreshed")
            print_schema(conn)
            conn.close()
        else:
            print(f"db not found: {db_path}")
    else:
        verify()

View File

@@ -1,29 +0,0 @@
import sqlite3
import pandas as pd
# Relative path of the L3 feature database that verify() opens.
L3_DB_PATH = 'database/L3/L3_Features.sqlite'
def verify():
    """Print a quick sanity report for the ``dm_player_features`` table.

    Shows the total row count, selected columns of the first five rows, and
    ``describe()`` statistics for three rating/win-rate columns. The
    connection is closed even if a query or column lookup fails.
    """
    conn = sqlite3.connect(L3_DB_PATH)
    try:
        # 1. Row count
        count = conn.execute("SELECT COUNT(*) FROM dm_player_features").fetchone()[0]
        print(f"Total Players in L3: {count}")

        # 2. Sample data
        df = pd.read_sql_query("SELECT * FROM dm_player_features LIMIT 5", conn)
        print("\nSample Data (First 5 rows):")
        print(df[['steam_id_64', 'total_matches', 'basic_avg_rating', 'sta_last_30_rating', 'bat_kd_diff_high_elo', 'hps_clutch_win_rate_1v1']].to_string())

        # 3. Stats summary
        print("\nStats Summary:")
        full_df = pd.read_sql_query("SELECT basic_avg_rating, sta_last_30_rating, bat_win_rate_vs_all FROM dm_player_features", conn)
        print(full_df.describe())
    finally:
        conn.close()
# Print the L3 report only when executed as a script.
if __name__ == "__main__":
    verify()

View File

@@ -1,82 +0,0 @@
import sqlite3
import pandas as pd
import numpy as np
import sys
# Configure pandas display options so that no rows or columns are elided
# when the statistics tables are printed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_colwidth', None)
# Relative path of the L2 database inspected by check_all_tables().
db_path = 'database/L2/L2_Main.sqlite'
def check_all_tables():
    """Print per-column null/zero/cardinality statistics for every user table.

    Each table in the database at ``db_path`` is loaded in full. For every
    column the report shows the null count and percentage, the count and
    percentage of values that are numerically zero, the number of distinct
    values, and a truncated example value. Rows are sorted by ``Zero%``
    descending. Empty tables are reported and skipped.

    The connection is closed even when a table cannot be read.
    """
    conn = sqlite3.connect(db_path)
    try:
        # All user table names.
        tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'", conn)['name'].tolist()

        for table in tables:
            print(f"\n{'='*20} Table: {table} {'='*20}")

            # Quote the identifier so unusual table names do not break the SQL.
            quoted = '"' + table.replace('"', '""') + '"'

            # All columns of the table.
            cols_info = pd.read_sql(f"PRAGMA table_info({quoted})", conn)
            cols = cols_info['name'].tolist()

            # Whole-table read.
            df = pd.read_sql(f"SELECT * FROM {quoted}", conn)
            total = len(df)
            if total == 0:
                print(f"Table is empty (0 rows)")
                continue

            print(f"Total Rows: {total}")
            print("-" * 60)

            stats = []
            for col in cols:
                series = df[col]

                # 1. Null check
                nulls = series.isnull().sum()

                # 2. Zero check: values that cannot be converted to a number
                #    become NaN and are therefore not counted as zero.
                try:
                    zeros = (pd.to_numeric(series, errors='coerce') == 0).sum()
                except (TypeError, ValueError):
                    zeros = 0

                # 3. Unique count (cardinality)
                unique_count = series.nunique()

                # 4. Example value: the first non-null entry
                example = series.dropna().iloc[0] if series.count() > 0 else 'ALL NULL'

                stats.append({
                    'Field': col,
                    'Nulls': nulls,
                    'Null%': (nulls/total)*100,
                    'Zeros': zeros,
                    'Zero%': (zeros/total)*100,
                    'Unique': unique_count,
                    'Example': str(example)[:50]  # truncate long examples
                })

            # Sorted by Zero% descending so suspicious all-zero columns come first.
            df_stats = pd.DataFrame(stats).sort_values('Zero%', ascending=False)
            print(df_stats.to_string(index=False))
            print("\n")
    finally:
        conn.close()
# Profile all tables only when executed as a script.
if __name__ == "__main__":
    check_all_tables()

File diff suppressed because it is too large Load Diff

View File

@@ -1,74 +0,0 @@
import sqlite3
import pandas as pd
import os
# Config: relative paths of the L2 (source of player names) and L3 (player
# features) SQLite databases used by analyze_team_dmg_per_1k().
L2_DB_PATH = r'database/L2/L2_Main.sqlite'
L3_DB_PATH = r'database/L3/L3_Features.sqlite'
def analyze_team_dmg_per_1k():
    """Print players ranked by ``eco_avg_damage_per_1k``, highest first.

    Feature values are read from ``dm_player_features`` in the L3 database
    and player names from ``dim_players`` in the L2 database; the two are
    joined in pandas because they live in separate SQLite files. A player
    without a username is shown by steam id, and a missing metric is shown
    as ``N/A``.

    Errors are printed with a traceback instead of being raised, and both
    connections are always closed.
    """
    import traceback

    # sqlite3.connect() silently creates a missing file, so both databases
    # are checked up front.
    if not os.path.exists(L3_DB_PATH):
        print(f"Error: L3 DB not found at {L3_DB_PATH}")
        return
    if not os.path.exists(L2_DB_PATH):
        print(f"Error: L2 DB not found at {L2_DB_PATH}")
        return

    print("--- Analysis: Team Dmg/$1k (Economy Efficiency) ---")

    conn_l3 = None
    conn_l2 = None
    try:
        conn_l3 = sqlite3.connect(L3_DB_PATH)
        conn_l2 = sqlite3.connect(L2_DB_PATH)

        df_l3 = pd.read_sql_query("SELECT steam_id_64, eco_avg_damage_per_1k FROM dm_player_features", conn_l3)
        if df_l3.empty:
            print("No data in L3 Features.")
            return

        # Read the name table as a whole: an IN (?, ?, ...) list with one
        # placeholder per player can exceed SQLite's bound-parameter limit.
        df_names = pd.read_sql_query("SELECT steam_id_64, username FROM dim_players", conn_l2)

        df = df_l3.merge(df_names, on='steam_id_64', how='left')
        df = df.sort_values('eco_avg_damage_per_1k', ascending=False).reset_index(drop=True)

        print(f"{'Rank':<5} {'Player':<20} {'Dmg/$1k':<10}")
        print("-" * 40)

        for rank, row in enumerate(df.itertuples(index=False), start=1):
            # After the left merge a missing name is NaN, which is truthy,
            # so it has to be tested explicitly.
            has_name = pd.notna(row.username) and bool(row.username)
            name = row.username if has_name else row.steam_id_64
            val = row.eco_avg_damage_per_1k
            val_text = f"{val:.2f}" if pd.notna(val) else "N/A"
            print(f"#{rank:<4} {str(name):<20} {val_text}")
    except Exception as e:
        print(f"Error: {e}")
        traceback.print_exc()
    finally:
        for conn in (conn_l2, conn_l3):
            if conn is not None:
                conn.close()
# Print the ranking only when executed as a script.
if __name__ == "__main__":
    analyze_team_dmg_per_1k()

View File

@@ -1,45 +0,0 @@
import sqlite3
import pandas as pd
from web.services.feature_service import FeatureService
from web.config import Config
from web.app import create_app
def check_distribution():
    """Print which expected keys the roster feature distribution contains.

    Takes the first ``steam_id_64`` found in ``dm_player_features`` (L3
    database at ``Config.DB_L3_PATH``), asks
    ``FeatureService.get_roster_features_distribution`` for that player
    inside a Flask app context, and prints presence and value for a fixed
    list of keys.
    """
    app = create_app()
    with app.app_context():
        # The connection is needed for one lookup only; close it right away
        # so no handle stays open on the early returns below.
        conn = sqlite3.connect(Config.DB_L3_PATH)
        try:
            row = conn.execute("SELECT steam_id_64 FROM dm_player_features LIMIT 1").fetchone()
        finally:
            conn.close()

        if not row:
            print("No players in L3")
            return

        sid = row[0]
        print(f"Checking distribution for {sid}...")

        dist = FeatureService.get_roster_features_distribution(sid)
        if not dist:
            print("Distribution returned None")
            return

        # Trailing notes are the original author's observations.
        keys_to_check = [
            'eco_avg_damage_per_1k',           # Working
            'eco_rating_eco_rounds',           # Working
            'eco_kd_ratio',                    # Broken
            'eco_avg_rounds',                  # Broken
            'pace_avg_time_to_first_contact',  # Working
            'pace_trade_kill_rate',            # Working
            'pace_opening_kill_time',          # Broken
            'pace_avg_life_time'               # Broken
        ]

        print(f"{'Key':<35} | {'Present':<7} | {'Value'}")
        print("-" * 60)
        for k in keys_to_check:
            is_present = k in dist
            val = dist.get(k)
            print(f"{k:<35} | {str(is_present):<7} | {val}")
# Run the distribution check only when executed as a script.
if __name__ == "__main__":
    check_distribution()

View File

@@ -1,94 +0,0 @@
import sqlite3
import pandas as pd
import os
# Config: relative path of the L2 SQLite database queried by debug_player_data().
L2_DB_PATH = r'database/L2/L2_Main.sqlite'
def debug_player_data(username_pattern='jAckY'):
    """Print match and economy totals for the first player matching a name.

    Args:
        username_pattern: Text searched for inside ``dim_players.username``
            with ``LIKE '%pattern%'``. ``%`` and ``_`` inside the pattern
            still act as LIKE wildcards.

    Compares the rounds and damage recorded in ``fact_match_players`` with
    the rounds and spend in ``fact_round_player_economy`` and, when economy
    data is missing, prints a damage-per-$1k figure restricted to matches
    that have spend. Errors are printed, not raised; the connection is
    always closed.
    """
    if not os.path.exists(L2_DB_PATH):
        print(f"Error: L2 DB not found at {L2_DB_PATH}")
        return

    conn_l2 = sqlite3.connect(L2_DB_PATH)
    print(f"--- Debugging Player: {username_pattern} ---")
    try:
        # 1. Find the player ID. The pattern is bound as a parameter so a
        #    quote in the name cannot alter the SQL.
        df_player = pd.read_sql_query(
            "SELECT steam_id_64, username FROM dim_players WHERE username LIKE ?",
            conn_l2,
            params=(f"%{username_pattern}%",),
        )
        if df_player.empty:
            print("Player not found.")
            return

        target_id = df_player.iloc[0]['steam_id_64']
        name = df_player.iloc[0]['username']
        print(f"Found: {name} ({target_id})")

        # Bound as text, like the quoted literal used previously; this also
        # avoids handing a numpy scalar to sqlite3.
        id_param = (str(target_id),)

        # 2. Check Match Stats (ADR, Rounds)
        q_matches = """
            SELECT match_id, round_total, adr, (adr * round_total) as damage_calc
            FROM fact_match_players
            WHERE steam_id_64 = ?
        """
        df_matches = pd.read_sql_query(q_matches, conn_l2, params=id_param)
        total_dmg = df_matches['damage_calc'].sum()
        total_rounds = df_matches['round_total'].sum()
        print(f"\nMatch Stats:")
        print(f"Matches Played: {len(df_matches)}")
        print(f"Total Rounds: {total_rounds}")
        print(f"Total Damage (Calc): {total_dmg:,.0f}")

        # 3. Check Economy Stats (Spend)
        q_eco = """
            SELECT match_id, COUNT(*) as rounds_with_eco, SUM(equipment_value) as spend
            FROM fact_round_player_economy
            WHERE steam_id_64 = ?
            GROUP BY match_id
        """
        df_eco = pd.read_sql_query(q_eco, conn_l2, params=id_param)
        total_spend = df_eco['spend'].sum()
        total_eco_rounds = df_eco['rounds_with_eco'].sum()
        print(f"\nEconomy Stats:")
        print(f"Matches with Eco Data: {len(df_eco)}")
        print(f"Rounds with Eco Data: {total_eco_rounds}")
        print(f"Total Spend: ${total_spend:,.0f}")

        # 4. Compare
        print(f"\nComparison:")
        print(f"Rounds in Match Stats: {total_rounds}")
        print(f"Rounds in Eco Stats: {total_eco_rounds}")

        # NOTE(review): the nesting of this section was reconstructed from a
        # listing without indentation - confirm against the original file.
        if total_eco_rounds < total_rounds:
            print(f"⚠️ WARNING: Missing economy data for {total_rounds - total_eco_rounds} rounds!")

            # Find matches with missing eco data
            merged = df_matches.merge(df_eco, on='match_id', how='left')
            missing = merged[merged['spend'].isna() | (merged['spend'] == 0)]
            if not missing.empty:
                print(f"\nMatches with ZERO spend/Missing Eco:")
                print(missing[['match_id', 'round_total', 'damage_calc']])

                # Check calculation impact: damage only from matches that
                # actually have spend recorded.
                valid_dmg = merged[merged['spend'] > 0]['damage_calc'].sum()
                print(f"\nRecalculation ignoring missing matches:")
                print(f"Valid Damage: {valid_dmg:,.0f}")
                print(f"Total Spend: ${total_spend:,.0f}")
                if total_spend > 0:
                    new_val = valid_dmg / (total_spend / 1000)
                    print(f"Corrected Dmg/$1k: {new_val:.2f}")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        conn_l2.close()
# Debug the default player only when executed as a script.
if __name__ == "__main__":
    debug_player_data()