""" Clutch-IQ Training Pipeline (L2 -> L3 -> Model) This script: 1. Loads L1B Parquet snapshots. 2. Performs L2 Feature Engineering (aggregates player-level data to frame-level features). 3. Trains an XGBoost Classifier. 4. Evaluates the model. 5. Saves the model artifact. Usage: python src/training/train.py """ import os import glob import pandas as pd import numpy as np import xgboost as xgb from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, log_loss, classification_report import joblib import logging import sys import json import sqlite3 # Import Spatial & Economy Engines sys.path.append(os.path.join(os.path.dirname(__file__), '..')) from features.spatial import calculate_spatial_features from features.economy import calculate_economy_features from features.definitions import FEATURE_COLUMNS # Configuration DATA_DIR = "data/processed" MODEL_DIR = "models" MODEL_PATH = os.path.join(MODEL_DIR, "clutch_model_v1.json") L3_DB_PATH = os.path.join("database", "L3", "L3.db") L2_DB_PATH = os.path.join("database", "L2", "L2.db") TEST_SIZE = 0.2 RANDOM_STATE = 42 # Configure logging to output to stdout logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[logging.StreamHandler(sys.stdout)] ) def load_data(data_dir): """Load all parquet files from the data directory.""" files = glob.glob(os.path.join(data_dir, "*.parquet")) if not files: raise FileNotFoundError(f"No parquet files found in {data_dir}") dfs = [] for f in files: logging.info(f"Loading {f}...") dfs.append(pd.read_parquet(f)) return pd.concat(dfs, ignore_index=True) def preprocess_features(df): """ L2 Feature Engineering: Convert player-level snapshots to frame-level features. Input: DataFrame with one row per player per tick. Output: DataFrame with one row per tick (frame) with aggregated features. """ logging.info("Starting feature engineering...") # 1. Drop rows with missing target (warmup rounds etc.) df = df.dropna(subset=['round_winner']).copy() # 2. Group by Frame (Match, Round, Time_Bin) # We use 'tick' as the unique identifier for a frame within a match # Grouping keys: ['match_id', 'round', 'tick'] # Define aggregation logic # We want: # - CT Alive Count # - T Alive Count # - CT Total Health # - T Total Health # - CT Equipment Value (approximate via weapon/armor?) - Let's stick to health/count first. # - Target: round_winner (should be same for all rows in a group) # Helper for one-hot encoding teams if needed, but here we just pivot # Create team-specific features # Team 2 = T, Team 3 = CT df['is_t'] = (df['team_num'] == 2).astype(int) df['is_ct'] = (df['team_num'] == 3).astype(int) # Calculate player specific metrics df['t_alive'] = df['is_t'] * df['is_alive'].astype(int) df['ct_alive'] = df['is_ct'] * df['is_alive'].astype(int) df['t_health'] = df['is_t'] * df['health'] df['ct_health'] = df['is_ct'] * df['health'] # Aggregate per frame group_cols = ['match_id', 'map_name', 'round', 'tick', 'round_winner', 'is_bomb_planted', 'site'] # Check if 'is_bomb_planted' and 'site' exist (compatibility with old data) if 'is_bomb_planted' not in df.columns: df['is_bomb_planted'] = 0 if 'site' not in df.columns: df['site'] = 0 agg_funcs = { 't_alive': 'sum', 'ct_alive': 'sum', 't_health': 'sum', 'ct_health': 'sum', 'game_time': 'first', # Game time is same for the frame } # GroupBy # Note: 'round_winner' is in group_cols because it's constant per group features_df = df.groupby(group_cols).agg(agg_funcs).reset_index() # 3. Add derived features features_df['health_diff'] = features_df['ct_health'] - features_df['t_health'] features_df['alive_diff'] = features_df['ct_alive'] - features_df['t_alive'] # 4. [NEW] Calculate Spatial Features logging.info("Calculating spatial features...") spatial_features = calculate_spatial_features(df) # 5. [NEW] Calculate Economy Features logging.info("Calculating economy features...") economy_features = calculate_economy_features(df) # Merge all features # Keys: match_id, round, tick features_df = pd.merge(features_df, spatial_features, on=['match_id', 'round', 'tick'], how='left') features_df = pd.merge(features_df, economy_features, on=['match_id', 'round', 'tick'], how='left') rating_map = {} try: if os.path.exists(L3_DB_PATH): conn = sqlite3.connect(L3_DB_PATH) cursor = conn.cursor() cursor.execute("SELECT steam_id_64, core_avg_rating FROM dm_player_features") rows = cursor.fetchall() conn.close() rating_map = {str(r[0]): float(r[1]) for r in rows if r and r[0] is not None and r[1] is not None} elif os.path.exists(L2_DB_PATH): conn = sqlite3.connect(L2_DB_PATH) cursor = conn.cursor() cursor.execute(""" SELECT steam_id_64, AVG(rating) as avg_rating FROM fact_match_players WHERE rating IS NOT NULL GROUP BY steam_id_64 """) rows = cursor.fetchall() conn.close() rating_map = {str(r[0]): float(r[1]) for r in rows if r and r[0] is not None and r[1] is not None} except Exception: rating_map = {} # 6. Player "clutch ability" proxy: experience (non-label, non-leaky) # player_experience = number of snapshot-rows observed for this steamid in the dataset df = df.copy() if 'player_rating' in df.columns: df['player_rating'] = pd.to_numeric(df['player_rating'], errors='coerce').fillna(0.0).astype('float32') elif 'rating' in df.columns: df['player_rating'] = pd.to_numeric(df['rating'], errors='coerce').fillna(0.0).astype('float32') elif 'steamid' in df.columns: df['player_rating'] = df['steamid'].astype(str).map(rating_map).fillna(0.0).astype('float32') else: df['player_rating'] = 0.0 group_keys = ['match_id', 'round', 'tick'] alive_df_for_rating = df[df['is_alive'] == True].copy() t_rating = ( alive_df_for_rating[alive_df_for_rating['team_num'] == 2] .groupby(group_keys)['player_rating'] .mean() .rename('t_player_rating') .reset_index() ) ct_rating = ( alive_df_for_rating[alive_df_for_rating['team_num'] == 3] .groupby(group_keys)['player_rating'] .mean() .rename('ct_player_rating') .reset_index() ) features_df = pd.merge(features_df, t_rating, on=group_keys, how='left') features_df = pd.merge(features_df, ct_rating, on=group_keys, how='left') if 'steamid' in df.columns: player_exp = df.groupby('steamid').size().rename('player_experience').reset_index() df_with_exp = pd.merge(df, player_exp, on='steamid', how='left') alive_df_for_exp = df_with_exp[df_with_exp['is_alive'] == True].copy() t_exp = ( alive_df_for_exp[alive_df_for_exp['team_num'] == 2] .groupby(group_keys)['player_experience'] .mean() .rename('t_player_experience') .reset_index() ) ct_exp = ( alive_df_for_exp[alive_df_for_exp['team_num'] == 3] .groupby(group_keys)['player_experience'] .mean() .rename('ct_player_experience') .reset_index() ) features_df = pd.merge(features_df, t_exp, on=group_keys, how='left') features_df = pd.merge(features_df, ct_exp, on=group_keys, how='left') else: features_df['t_player_experience'] = 0.0 features_df['ct_player_experience'] = 0.0 if 't_player_rating' not in features_df.columns: features_df['t_player_rating'] = 0.0 if 'ct_player_rating' not in features_df.columns: features_df['ct_player_rating'] = 0.0 # Fill NaN spatial/eco features features_df = features_df.fillna(0) logging.info(f"Generated {len(features_df)} frames for training.") return features_df def train_model(df): """Train XGBoost Classifier.""" # Features (X) and Target (y) feature_cols = FEATURE_COLUMNS target_col = 'round_winner' logging.info(f"Training features: {feature_cols}") # Split by match_id to ensure no data leakage between training and testing groups unique_matches = df['match_id'].unique() logging.info(f"Total matches found: {len(unique_matches)}") # Logic to ensure 15 training / 2 validation split as requested # If we have 17 matches, 2 matches is approx 0.1176 # If we have exactly 17 matches, we can use test_size=2/17 or just use integer 2 if supported by train_test_split (it is for int >= 1) test_size_param = 2 if len(unique_matches) >= 3 else 0.2 if len(unique_matches) < 2: logging.warning("Less than 2 matches found. Falling back to random frame split (potential leakage).") X = df[feature_cols] y = df[target_col].astype(int) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE) else: # Use integer for exact number of test samples if we want exactly 2 matches train_matches, test_matches = train_test_split(unique_matches, test_size=test_size_param, random_state=RANDOM_STATE) logging.info(f"Training matches ({len(train_matches)}): {train_matches}") logging.info(f"Testing matches ({len(test_matches)}): {test_matches}") train_df = df[df['match_id'].isin(train_matches)] test_df = df[df['match_id'].isin(test_matches)] X_train = train_df[feature_cols] y_train = train_df[target_col].astype(int) X_test = test_df[feature_cols] y_test = test_df[target_col].astype(int) # Init Model model = xgb.XGBClassifier( n_estimators=100, learning_rate=0.1, max_depth=5, objective='binary:logistic', use_label_encoder=False, eval_metric='logloss' ) # Train logging.info("Fitting model...") model.fit(X_train, y_train) # Save Test Set for Evaluation Script test_set_path = os.path.join("data", "processed", "test_set.parquet") logging.info(f"Saving validation set to {test_set_path}...") test_df.to_parquet(test_set_path) # Feature Importance (Optional: keep for training log context) importance = model.feature_importances_ feature_importance_df = pd.DataFrame({ 'Feature': feature_cols, 'Importance': importance }).sort_values(by='Importance', ascending=False) logging.info("\nTop 10 Important Features:") logging.info(feature_importance_df.head(10).to_string(index=False)) return model def main(): if not os.path.exists(MODEL_DIR): os.makedirs(MODEL_DIR) try: # 1. Load raw_df = load_data(DATA_DIR) # 2. Preprocess features_df = preprocess_features(raw_df) if features_df.empty: logging.error("No data available for training after preprocessing.") return # 3. Train model = train_model(features_df) # 4. Save model.save_model(MODEL_PATH) logging.info(f"Model saved to {MODEL_PATH}") # 5. Save player experience map for inference (optional) if 'steamid' in raw_df.columns: exp_map = raw_df.groupby('steamid').size().to_dict() exp_path = os.path.join(MODEL_DIR, "player_experience.json") with open(exp_path, "w", encoding="utf-8") as f: json.dump(exp_map, f) logging.info(f"Player experience map saved to {exp_path}") except Exception as e: logging.error(f"Training failed: {e}") import traceback traceback.print_exc() if __name__ == "__main__": main()