368 lines
16 KiB
Python
368 lines
16 KiB
Python
"""
|
||
L1B 快照引擎 (Parquet 版本)
|
||
|
||
这是第一阶段 (Phase 1) 的核心 ETL 脚本。
|
||
它负责从 CS2 .dem 文件中提取 Tick 级别的快照,并将其保存为高压缩率的 Parquet 文件。
|
||
|
||
用法:
|
||
python src/etl/extract_snapshots.py --demo_dir data/demos --output_dir data/processed
|
||
|
||
配置:
|
||
调整下方的参数以控制数据粒度
|
||
"""
|
||
|
||
import os
|
||
import argparse
|
||
import pandas as pd
|
||
import numpy as np
|
||
from demoparser2 import DemoParser # 核心依赖
|
||
import logging
|
||
import sys
|
||
|
||
# ==============================================================================
# Configuration and tuning parameters (user-editable section)
# ==============================================================================

# [Important] Sampling rate: how often a snapshot is taken.
# Lower value  = more data, finer granularity, slower processing.
# Higher value = less data, faster processing.
# Suggested range: 1-5 seconds (default: 2 s).
SNAPSHOT_INTERVAL_SECONDS = 2

# [Important] Round filter: which snapshots are kept.
#   'clutch_only': keep only snapshots taken while a clutch situation is in
#                  progress (see MAX_PLAYERS_PER_TEAM below).
#   'all':         keep every snapshot (the dataset becomes very large).
FILTER_MODE = 'clutch_only'

# [Important] Clutch definition: a situation counts as a clutch when at least
# one team has this many players or fewer still alive (<= 2vX or Xv2 with the
# suggested value of 2).
MAX_PLAYERS_PER_TEAM = 2

# Field selection (optimisation): only these fields are extracted from the
# demo, to save memory.
WANTED_FIELDS = [
    "game_time",           # in-game time
    "team_num",            # team number
    "player_name",         # player nickname
    "steamid",             # Steam ID
    "X",                   # position
    "Y",
    "Z",
    "view_X",              # view angles
    "view_Y",
    "health",              # health points
    "armor_value",         # armor points
    "has_defuser",         # whether the player carries a defuse kit
    "has_helmet",          # whether the player has a helmet
    "active_weapon_name",  # weapon currently held
    "flash_duration",      # flash-blindness duration
    "is_alive",            # whether the player is alive
    "balance",             # [NEW] remaining money (correct field name)
]

# ==============================================================================
# End of configuration
# ==============================================================================

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
|
||
|
||
def is_clutch_situation(ct_alive, t_alive, max_players=None):
    """
    Check whether the current alive counts describe a "clutch" scenario.

    A clutch is any state in which both teams still have at least one player
    alive and at least one team is down to ``max_players`` players or fewer.
    The size of the opposing team does not matter (e.g. 2v5 is a clutch for
    the team with two players left).

    Args:
        ct_alive: Number of CT players alive.
        t_alive: Number of T players alive.
        max_players: Largest team size that still counts as a clutch.
            Defaults to the module-level ``MAX_PLAYERS_PER_TEAM`` when None.

    Returns:
        True when the state is a clutch, False otherwise.
    """
    if max_players is None:
        # Resolved at call time so edits to the module-level setting are honoured.
        max_players = MAX_PLAYERS_PER_TEAM

    # A team with nobody alive means the fight is already over.
    if ct_alive == 0 or t_alive == 0:
        return False

    # Requirement: "the opponent's count does not matter, as long as one side
    # is down to N players" -> either team being <= N qualifies.
    return ct_alive <= max_players or t_alive <= max_players
|
||
|
||
_BOMB_EVENT_NAMES = ("bomb_planted", "bomb_defused", "bomb_exploded")


def _remove_source_file(demo_path, success_message):
    """Delete the source demo; log a warning instead of raising on failure."""
    try:
        os.remove(demo_path)
    except OSError as e:
        logging.warning(f"删除源文件失败: {e}")
    else:
        logging.info(success_message)


def _split_events(parsed_events):
    """Split parse_events output into (round_start df, round_end df, bomb event frames)."""
    round_df = None
    winner_df = None
    bomb_events = []

    # parse_events yields (event_name, DataFrame) pairs.
    for event_name, event_data in parsed_events:
        if event_name == "round_start":
            round_df = event_data
        elif event_name == "round_end":
            winner_df = event_data
        elif event_name in _BOMB_EVENT_NAMES:
            # An empty frame may lack the 'tick' column; it carries no state anyway.
            if event_data is None or event_data.empty:
                continue
            bomb_frame = event_data.copy()
            bomb_frame['event_type'] = event_name
            # Only some bomb events carry a site; normalise the rest to 0.
            if 'site' not in bomb_frame.columns:
                bomb_frame['site'] = 0
            bomb_events.append(bomb_frame[['tick', 'event_type', 'site']])

    return round_df, winner_df, bomb_events


def _attach_round_numbers(df, round_df, demo_name):
    """Add a 'round' column to the tick-sorted frame from round_start events."""
    if round_df is not None and not round_df.empty:
        round_df = round_df.sort_values('tick', kind='stable')
        # direction='backward': each tick gets the latest round_start at or before it.
        df = pd.merge_asof(df, round_df[['tick', 'round']], on='tick', direction='backward')
        # Ticks before the first round_start have no match; label them round 0.
        df['round'] = df['round'].fillna(0).astype(int)
    else:
        logging.warning(f"在 {demo_name} 中未找到 round_start 事件,默认为第 1 回合")
        df['round'] = 1
    return df


def _attach_round_winners(df, winner_df, demo_name):
    """Add a 'round_winner' column (0 = T, 1 = CT, missing = unknown) from round_end events."""
    if winner_df is not None and not winner_df.empty:
        valid_winners = winner_df.dropna(subset=['winner'])

        # The winner is accepted either as a team name ('T'/'CT') or as a
        # team number (2/3); any other value leaves the round unlabelled.
        round_winner = {}
        for round_no, winner in zip(valid_winners['round'], valid_winners['winner']):
            if winner == 'T' or winner == 2:
                round_winner[round_no] = 0  # T wins
            elif winner == 'CT' or winner == 3:
                round_winner[round_no] = 1  # CT wins

        # Rounds without a recorded winner stay missing; they are kept here
        # so that later steps can decide whether to drop them.
        df['round_winner'] = df['round'].map(round_winner)
    else:
        logging.warning(f"在 {demo_name} 中未找到 round_end 事件,无法标记胜者")
        df['round_winner'] = None
    return df


def _attach_bomb_state(df, bomb_events, round_df):
    """Add 'is_bomb_planted' and 'site' columns to the tick-sorted frame."""
    if not bomb_events:
        df['is_bomb_planted'] = 0
        df['site'] = 0
        return df

    timeline = list(bomb_events)
    if round_df is not None and not round_df.empty:
        # merge_asof carries the previous state forward, so every round_start
        # is inserted as a reset to the "not planted" state.
        reset_df = round_df[['tick']].copy()
        reset_df['event_type'] = 'reset'
        reset_df['site'] = 0
        # Resets go first: with a stable sort a bomb event sharing a tick with
        # a reset comes after it, and merge_asof keeps the last row per tick.
        timeline.insert(0, reset_df)

    bomb_df = pd.concat(timeline, ignore_index=True).sort_values('tick', kind='stable')

    # bomb_planted -> 1; defused / exploded / reset -> 0.
    bomb_df['is_bomb_planted'] = (bomb_df['event_type'] == 'bomb_planted').astype(int)
    state_df = bomb_df[['tick', 'is_bomb_planted', 'site']]

    df = pd.merge_asof(df, state_df, on='tick', direction='backward')

    # Ticks before the first bomb/reset event: bomb not planted.
    df['is_bomb_planted'] = df['is_bomb_planted'].fillna(0).astype(int)
    df['site'] = df['site'].fillna(0).astype(int)
    return df


def _sample_snapshots(df):
    """Keep every player row of the first tick in each (round, time_bin) bucket."""
    df['time_bin'] = (df['game_time'] // SNAPSHOT_INTERVAL_SECONDS).astype(int)

    # Select whole ticks rather than groupby().first(), which would keep a
    # single player row per bucket and lose the other players.
    first_ticks = df.groupby(['round', 'time_bin'])['tick'].min()
    return df[df['tick'].isin(first_ticks.values)].copy()


def _keep_clutch_frames(snapshot_df):
    """Keep only the (round, time_bin) frames that satisfy is_clutch_situation."""
    alive_rows = snapshot_df[snapshot_df['is_alive'].eq(True)]
    alive_counts = (
        alive_rows.groupby(['round', 'time_bin', 'team_num']).size().unstack(fill_value=0)
    )

    # Make sure both team columns exist (2 = T, 3 = CT).
    for team in (2, 3):
        if team not in alive_counts.columns:
            alive_counts[team] = 0

    # Frames in which nobody is alive never appear in alive_counts and are
    # therefore dropped as well. An explicit boolean array is used so that an
    # empty mask is never mistaken for a column selection.
    clutch_mask = np.array(
        [is_clutch_situation(ct, t) for ct, t in zip(alive_counts[3], alive_counts[2])],
        dtype=bool,
    )
    clutch_frames = alive_counts.index[clutch_mask]

    frame_keys = pd.MultiIndex.from_frame(snapshot_df[['round', 'time_bin']])
    return snapshot_df[frame_keys.isin(clutch_frames)].copy()


def _downcast_columns(snapshot_df):
    """Shrink column dtypes in place to cut memory use and file size."""
    # float64 -> float32
    for col in ('X', 'Y', 'Z', 'view_X', 'view_Y', 'game_time', 'flash_duration'):
        if col in snapshot_df.columns:
            snapshot_df[col] = snapshot_df[col].astype('float32')

    # team_num is 2 or 3 -> int8
    snapshot_df['team_num'] = snapshot_df['team_num'].astype('int8')

    # Small integer fields -> int16, with missing values stored as 0.
    for col in ('health', 'armor_value', 'balance', 'site'):
        if col in snapshot_df.columns:
            snapshot_df[col] = snapshot_df[col].fillna(0).astype('int16')

    # round -> int16, tick -> int32
    snapshot_df['round'] = snapshot_df['round'].astype('int16')
    snapshot_df['tick'] = snapshot_df['tick'].astype('int32')

    # Flags are stored as 0/1 int8.
    for col in ('is_alive', 'has_defuser', 'has_helmet', 'is_bomb_planted'):
        if col in snapshot_df.columns:
            snapshot_df[col] = snapshot_df[col].astype('int8')

    # time_bin was only a sampling aid.
    if 'time_bin' in snapshot_df.columns:
        snapshot_df.drop(columns=['time_bin'], inplace=True)


def _write_parquet_atomically(snapshot_df, output_path):
    """Write the Parquet file via a temporary path so no partial output is left behind."""
    # A half-written file at output_path would make later runs skip this demo
    # (and delete its source when delete_source is set), so the final name only
    # appears once the write has completed.
    tmp_path = f"{output_path}.tmp"
    try:
        # zstd usually compresses noticeably better than the default snappy.
        snapshot_df.to_parquet(tmp_path, index=False, compression='zstd')
        os.replace(tmp_path, output_path)
    except Exception:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


def process_demo(demo_path, output_dir, delete_source=False):
    """
    Parse a single .dem file and export its snapshots as a Parquet file.

    The output is written to ``<output_dir>/<demo name>.parquet``. If that file
    already exists the demo is skipped. Otherwise the demo is parsed, every
    tick is labelled with its round, round winner and bomb state, the ticks
    are sampled once per SNAPSHOT_INTERVAL_SECONDS of game time and, when
    FILTER_MODE is 'clutch_only', reduced to clutch frames before saving.

    Args:
        demo_path: Path of the .dem file to process.
        output_dir: Directory that receives the .parquet file.
        delete_source: When True, delete ``demo_path`` after a successful
            export, or when the output already exists.

    Returns:
        None. Processing errors are logged rather than raised.

    Raises:
        SystemExit: When the error message contains "Source1", i.e. the demo
            is reported as a CS:GO (Source 1) demo.
    """
    # Strip only the trailing extension; str.replace would also remove '.dem'
    # from the middle of names such as 'match.demo.dem'.
    base_name = os.path.basename(demo_path)
    demo_name = base_name[:-len('.dem')] if base_name.endswith('.dem') else base_name
    output_path = os.path.join(output_dir, f"{demo_name}.parquet")

    if os.path.exists(output_path):
        logging.info(f"跳过 {demo_name}, 文件已存在。")
        if delete_source:
            _remove_source_file(demo_path, f"已删除源文件 (因为已存在处理结果): {demo_path}")
        return

    logging.info(f"正在处理: {demo_name}")

    try:
        parser = DemoParser(demo_path)

        # 1. Metadata (map name from the header).
        header = parser.parse_header()
        map_name = header.get("map_name", "unknown")

        # 2. Events: round_start gives round numbers, round_end gives winners,
        #    bomb events give is_bomb_planted / site.
        event_names = ["round_start", "round_end", *_BOMB_EVENT_NAMES]
        round_df, winner_df, bomb_events = _split_events(parser.parse_events(event_names))

        # 3. Player state for every tick (the heavy part); filtering comes later.
        df = parser.parse_ticks(WANTED_FIELDS)

        # merge_asof needs sorted keys. Sort once, up front, so the bomb merge
        # also works when there are no round_start events. A stable sort keeps
        # the rows within one tick in their original order.
        df = df.sort_values('tick', kind='stable')

        df = _attach_round_numbers(df, round_df, demo_name)
        df = _attach_round_winners(df, winner_df, demo_name)
        df = _attach_bomb_state(df, bomb_events, round_df)

        # 4. Cleaning: team_num as int (CT = 3, T = 2), missing values as 0.
        df['team_num'] = df['team_num'].fillna(0).astype(int)

        # 5. Sampling: one snapshot per SNAPSHOT_INTERVAL_SECONDS of game time.
        snapshot_df = _sample_snapshots(df)

        # 6. Clutch filter.
        if FILTER_MODE == 'clutch_only':
            snapshot_df = _keep_clutch_frames(snapshot_df)

        if snapshot_df.empty:
            logging.warning(f"在 {demo_name} 中未找到有效快照 (过滤器: {FILTER_MODE})")
            return

        # 7. Metadata columns.
        snapshot_df['match_id'] = demo_name
        snapshot_df['map_name'] = map_name

        _downcast_columns(snapshot_df)

        # 8. Save as Parquet (L1B layer).
        _write_parquet_atomically(snapshot_df, output_path)
        logging.info(f"已保存 {len(snapshot_df)} 条快照到 {output_path} (压缩模式: ZSTD)")

        if delete_source:
            _remove_source_file(demo_path, f"处理成功,已删除源文件: {demo_path}")

    except Exception as e:
        logging.error(f"处理失败 {demo_name}: {str(e)}")
        # Give an explicit hint for Source 1 demos and stop the whole run.
        if "Source1" in str(e):
            logging.error("❌ 这是一个 CS:GO (Source 1) 的 Demo,本系统仅支持 CS2 (Source 2) Demo。")
            sys.exit(1)
|
||
|
||
def main():
    """
    Command-line entry point of the L1B snapshot engine.

    Parses the command-line arguments, makes sure the output directory exists
    and then processes either the single demo given with ``--file`` or every
    ``.dem`` file found directly inside ``--demo_dir``.

    Returns:
        None. Invalid inputs are logged and the function returns early.
    """
    parser = argparse.ArgumentParser(description="L1B 快照引擎")
    parser.add_argument('--demo_dir', type=str, default='data/demos', help='输入 .dem 文件的目录')
    parser.add_argument('--file', type=str, help='处理单个 .dem 文件 (如果指定此参数,将忽略 --demo_dir)')
    parser.add_argument('--output_dir', type=str, default='data/processed', help='输出 .parquet 文件的目录')
    parser.add_argument('--delete-source', action='store_true', help='处理成功后删除源文件')
    args = parser.parse_args()

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args.output_dir, exist_ok=True)

    # Mode 1: single file.
    if args.file:
        # isfile also rejects a directory that happens to be named '*.dem'.
        if not os.path.isfile(args.file):
            logging.error(f"文件不存在: {args.file}")
            return
        if not args.file.endswith('.dem'):
            logging.error(f"无效的文件扩展名: {args.file}")
            return
        process_demo(args.file, args.output_dir, delete_source=args.delete_source)
        return

    # Mode 2: batch over a directory.
    # isdir also covers the case where the path exists but is a regular file,
    # which would make os.listdir raise.
    if not os.path.isdir(args.demo_dir):
        logging.warning(f"目录不存在: {args.demo_dir}")
        return

    # Sorted so the processing order does not depend on the filesystem.
    demo_files = sorted(
        path
        for path in (os.path.join(args.demo_dir, entry) for entry in os.listdir(args.demo_dir))
        if path.endswith('.dem') and os.path.isfile(path)
    )

    if not demo_files:
        logging.warning(f"在 {args.demo_dir} 中未找到 .dem 文件。请添加 demo 文件。")
        return

    for demo_path in demo_files:
        process_demo(demo_path, args.output_dir, delete_source=args.delete_source)


if __name__ == "__main__":
    main()
|