341 lines
12 KiB
Python
341 lines
12 KiB
Python
|
|
"""
|
||
|
|
Clutch-IQ Training Pipeline (L2 -> L3 -> Model)
|
||
|
|
|
||
|
|
This script:
|
||
|
|
1. Loads L1B Parquet snapshots.
|
||
|
|
2. Performs L2 Feature Engineering (aggregates player-level data to frame-level features).
|
||
|
|
3. Trains an XGBoost Classifier.
|
||
|
|
4. Evaluates the model.
|
||
|
|
5. Saves the model artifact.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python src/training/train.py
|
||
|
|
"""
|
||
|
|
|
||
|
|
import os
|
||
|
|
import glob
|
||
|
|
import pandas as pd
|
||
|
|
import numpy as np
|
||
|
|
import xgboost as xgb
|
||
|
|
from sklearn.model_selection import train_test_split
|
||
|
|
from sklearn.metrics import accuracy_score, log_loss, classification_report
|
||
|
|
import joblib
|
||
|
|
import logging
|
||
|
|
import sys
|
||
|
|
import json
|
||
|
|
import sqlite3
|
||
|
|
|
||
|
|
# Import Spatial & Economy Engines
|
||
|
|
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
|
||
|
|
from features.spatial import calculate_spatial_features
|
||
|
|
from features.economy import calculate_economy_features
|
||
|
|
from features.definitions import FEATURE_COLUMNS
|
||
|
|
|
||
|
|
# Configuration
DATA_DIR = "data/processed"    # Source directory of L1B parquet snapshots
MODEL_DIR = "models"           # Output directory for trained model artifacts
MODEL_PATH = os.path.join(MODEL_DIR, "clutch_model_v1.json")
# Player-rating lookup databases: L3 is preferred, L2 is the fallback
# (see the rating-map loading logic in preprocess_features).
L3_DB_PATH = os.path.join("database", "L3", "L3.db")
L2_DB_PATH = os.path.join("database", "L2", "L2.db")
TEST_SIZE = 0.2                # Fraction used only by the random-frame fallback split
RANDOM_STATE = 42              # Seed for reproducible train/test splits

# Configure logging to output to stdout
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
|
||
|
|
|
||
|
|
def load_data(data_dir):
    """Load every parquet snapshot under *data_dir* into a single DataFrame.

    Raises:
        FileNotFoundError: when the directory contains no ``*.parquet`` files.
    """
    paths = glob.glob(os.path.join(data_dir, "*.parquet"))
    if not paths:
        raise FileNotFoundError(f"No parquet files found in {data_dir}")

    def _read(path):
        # Log each file as it is loaded so progress is visible for big runs.
        logging.info(f"Loading {path}...")
        return pd.read_parquet(path)

    return pd.concat([_read(p) for p in paths], ignore_index=True)
|
||
|
|
|
||
|
|
def _load_rating_map():
    """Best-effort load of steam_id -> average rating from L3 (preferred) or L2.

    Returns an empty dict when neither database exists or the query fails;
    player ratings are an optional enrichment and must never break training.
    """
    try:
        if os.path.exists(L3_DB_PATH):
            conn = sqlite3.connect(L3_DB_PATH)
            cursor = conn.cursor()
            cursor.execute("SELECT steam_id_64, core_avg_rating FROM dm_player_features")
            rows = cursor.fetchall()
            conn.close()
        elif os.path.exists(L2_DB_PATH):
            conn = sqlite3.connect(L2_DB_PATH)
            cursor = conn.cursor()
            cursor.execute("""
                SELECT steam_id_64, AVG(rating) as avg_rating
                FROM fact_match_players
                WHERE rating IS NOT NULL
                GROUP BY steam_id_64
            """)
            rows = cursor.fetchall()
            conn.close()
        else:
            return {}
        return {str(r[0]): float(r[1]) for r in rows if r and r[0] is not None and r[1] is not None}
    except Exception as e:
        # Previously this failure was swallowed silently; surface it so a
        # missing table / broken schema is at least visible in the log.
        logging.warning(f"Failed to load player rating map: {e}")
        return {}


def _team_mean(alive_df, team_num, value_col, out_name, group_keys):
    """Per-frame mean of *value_col* over alive players of one team.

    Returns a DataFrame keyed by *group_keys* with a single value column
    named *out_name*, ready to be merged onto the frame-level features.
    """
    return (
        alive_df[alive_df['team_num'] == team_num]
        .groupby(group_keys)[value_col]
        .mean()
        .rename(out_name)
        .reset_index()
    )


def preprocess_features(df):
    """
    L2 Feature Engineering: Convert player-level snapshots to frame-level features.

    Input: DataFrame with one row per player per tick.
    Output: DataFrame with one row per tick (frame) with aggregated features.
    """
    logging.info("Starting feature engineering...")

    # 1. Drop rows with missing target (warmup rounds etc.). The .copy()
    # here also protects the caller's DataFrame from the mutations below.
    df = df.dropna(subset=['round_winner']).copy()

    # 2. Team flags. Team 2 = T, Team 3 = CT (Source-engine convention —
    # TODO confirm this matches the parser that produced the snapshots).
    df['is_t'] = (df['team_num'] == 2).astype(int)
    df['is_ct'] = (df['team_num'] == 3).astype(int)

    # Per-player contributions; summed per frame by the groupby below.
    df['t_alive'] = df['is_t'] * df['is_alive'].astype(int)
    df['ct_alive'] = df['is_ct'] * df['is_alive'].astype(int)
    df['t_health'] = df['is_t'] * df['health']
    df['ct_health'] = df['is_ct'] * df['health']

    # Aggregate per frame. 'round_winner' sits in the group keys because it
    # is constant within a frame; bomb columns are defaulted for old data
    # that predates them.
    group_cols = ['match_id', 'map_name', 'round', 'tick', 'round_winner', 'is_bomb_planted', 'site']
    if 'is_bomb_planted' not in df.columns:
        df['is_bomb_planted'] = 0
    if 'site' not in df.columns:
        df['site'] = 0

    agg_funcs = {
        't_alive': 'sum',
        'ct_alive': 'sum',
        't_health': 'sum',
        'ct_health': 'sum',
        'game_time': 'first',  # Game time is same for the frame
    }
    features_df = df.groupby(group_cols).agg(agg_funcs).reset_index()

    # 3. Derived differential features (positive = CT advantage).
    features_df['health_diff'] = features_df['ct_health'] - features_df['t_health']
    features_df['alive_diff'] = features_df['ct_alive'] - features_df['t_alive']

    # 4. Spatial features, merged on the frame key.
    logging.info("Calculating spatial features...")
    spatial_features = calculate_spatial_features(df)

    # 5. Economy features, merged on the frame key.
    logging.info("Calculating economy features...")
    economy_features = calculate_economy_features(df)

    features_df = pd.merge(features_df, spatial_features, on=['match_id', 'round', 'tick'], how='left')
    features_df = pd.merge(features_df, economy_features, on=['match_id', 'round', 'tick'], how='left')

    rating_map = _load_rating_map()

    # Resolve a per-player rating column: prefer explicit columns over the
    # DB lookup; default 0.0 keeps downstream dtypes stable.
    if 'player_rating' in df.columns:
        df['player_rating'] = pd.to_numeric(df['player_rating'], errors='coerce').fillna(0.0).astype('float32')
    elif 'rating' in df.columns:
        df['player_rating'] = pd.to_numeric(df['rating'], errors='coerce').fillna(0.0).astype('float32')
    elif 'steamid' in df.columns:
        df['player_rating'] = df['steamid'].astype(str).map(rating_map).fillna(0.0).astype('float32')
    else:
        df['player_rating'] = 0.0

    group_keys = ['match_id', 'round', 'tick']
    alive_df_for_rating = df[df['is_alive'] == True].copy()
    t_rating = _team_mean(alive_df_for_rating, 2, 'player_rating', 't_player_rating', group_keys)
    ct_rating = _team_mean(alive_df_for_rating, 3, 'player_rating', 'ct_player_rating', group_keys)
    features_df = pd.merge(features_df, t_rating, on=group_keys, how='left')
    features_df = pd.merge(features_df, ct_rating, on=group_keys, how='left')

    # 6. Player "clutch ability" proxy: experience (non-label, non-leaky).
    # player_experience = number of snapshot-rows observed for this steamid
    # in the dataset.
    if 'steamid' in df.columns:
        player_exp = df.groupby('steamid').size().rename('player_experience').reset_index()
        df_with_exp = pd.merge(df, player_exp, on='steamid', how='left')
        alive_df_for_exp = df_with_exp[df_with_exp['is_alive'] == True].copy()

        t_exp = _team_mean(alive_df_for_exp, 2, 'player_experience', 't_player_experience', group_keys)
        ct_exp = _team_mean(alive_df_for_exp, 3, 'player_experience', 'ct_player_experience', group_keys)

        features_df = pd.merge(features_df, t_exp, on=group_keys, how='left')
        features_df = pd.merge(features_df, ct_exp, on=group_keys, how='left')
    else:
        features_df['t_player_experience'] = 0.0
        features_df['ct_player_experience'] = 0.0

    # Guarantee rating columns exist even when a merge produced nothing.
    if 't_player_rating' not in features_df.columns:
        features_df['t_player_rating'] = 0.0
    if 'ct_player_rating' not in features_df.columns:
        features_df['ct_player_rating'] = 0.0

    # Fill NaN spatial/eco features
    features_df = features_df.fillna(0)

    logging.info(f"Generated {len(features_df)} frames for training.")
    return features_df
|
||
|
|
|
||
|
|
def train_model(df):
    """Train an XGBoost classifier to predict the round winner per frame.

    Train/test are split by match_id so frames of one match never span both
    sets (frames within a match are highly correlated and would leak). The
    held-out frames are written to data/processed/test_set.parquet for the
    separate evaluation script.

    Args:
        df: Frame-level feature DataFrame from preprocess_features.

    Returns:
        The fitted xgb.XGBClassifier.
    """
    # Features (X) and Target (y)
    feature_cols = FEATURE_COLUMNS
    target_col = 'round_winner'

    logging.info(f"Training features: {feature_cols}")

    # Split by match_id to ensure no data leakage between training and testing groups
    unique_matches = df['match_id'].unique()
    logging.info(f"Total matches found: {len(unique_matches)}")

    # Hold out exactly 2 matches when we have at least 3 — train_test_split
    # accepts an int test_size as an absolute sample count.
    test_size_param = 2 if len(unique_matches) >= 3 else 0.2

    if len(unique_matches) < 2:
        logging.warning("Less than 2 matches found. Falling back to random frame split (potential leakage).")
        X = df[feature_cols]
        y = df[target_col].astype(int)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
        # BUGFIX: this branch previously never defined test_df, so saving the
        # validation set below raised NameError. Recover the held-out rows
        # (all columns, not just features) via the split's index.
        test_df = df.loc[X_test.index]
    else:
        train_matches, test_matches = train_test_split(unique_matches, test_size=test_size_param, random_state=RANDOM_STATE)

        logging.info(f"Training matches ({len(train_matches)}): {train_matches}")
        logging.info(f"Testing matches ({len(test_matches)}): {test_matches}")

        train_df = df[df['match_id'].isin(train_matches)]
        test_df = df[df['match_id'].isin(test_matches)]

        X_train = train_df[feature_cols]
        y_train = train_df[target_col].astype(int)

        X_test = test_df[feature_cols]
        y_test = test_df[target_col].astype(int)

    # Init Model
    model = xgb.XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        objective='binary:logistic',
        use_label_encoder=False,  # silences pre-1.6 label-encoder warning; ignored by newer xgboost
        eval_metric='logloss'
    )

    # Train
    logging.info("Fitting model...")
    model.fit(X_train, y_train)

    # Save Test Set for Evaluation Script. Create the directory defensively:
    # it normally exists (it is DATA_DIR), but a fresh checkout should not crash here.
    test_set_path = os.path.join("data", "processed", "test_set.parquet")
    os.makedirs(os.path.dirname(test_set_path), exist_ok=True)
    logging.info(f"Saving validation set to {test_set_path}...")
    test_df.to_parquet(test_set_path)

    # Feature Importance (kept for training log context)
    importance = model.feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': importance
    }).sort_values(by='Importance', ascending=False)

    logging.info("\nTop 10 Important Features:")
    logging.info(feature_importance_df.head(10).to_string(index=False))

    return model
|
||
|
|
|
||
|
|
def main():
    """End-to-end training pipeline: load -> features -> train -> save."""
    os.makedirs(MODEL_DIR, exist_ok=True)

    try:
        # 1. Load raw L1B snapshots.
        raw_df = load_data(DATA_DIR)

        # 2. Aggregate player-level rows to frame-level features.
        features_df = preprocess_features(raw_df)

        if features_df.empty:
            logging.error("No data available for training after preprocessing.")
            return

        # 3. Train
        model = train_model(features_df)

        # 4. Save model artifact.
        model.save_model(MODEL_PATH)
        logging.info(f"Model saved to {MODEL_PATH}")

        # 5. Save player experience map for inference (optional).
        if 'steamid' in raw_df.columns:
            # Cast keys/values to builtin str/int: groupby over a numeric
            # steamid column yields numpy int64 keys, which json.dump
            # rejects with TypeError.
            exp_map = {str(k): int(v) for k, v in raw_df.groupby('steamid').size().items()}
            exp_path = os.path.join(MODEL_DIR, "player_experience.json")
            with open(exp_path, "w", encoding="utf-8") as f:
                json.dump(exp_map, f)
            logging.info(f"Player experience map saved to {exp_path}")

    except Exception:
        # Top-level boundary: logging.exception records the message at ERROR
        # level with the full traceback (replaces manual traceback.print_exc).
        logging.exception("Training failed:")


if __name__ == "__main__":
    main()
|