简化项目并切换到v2模型与场景报告

2026-02-12 16:32:45 +08:00
parent 706940d8d3
commit a19da4728b
23 changed files with 454 additions and 1235 deletions
--- a/src/training/train.py
+++ b/src/training/train.py
@@ -29,12 +29,12 @@ import sqlite3
 sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
 from features.spatial import calculate_spatial_features
 from features.economy import calculate_economy_features
-from features.definitions import FEATURE_COLUMNS
+from features.definitions import FEATURE_COLUMNS, XGB_FEATURE_COLUMNS_V2

 # Configuration
 DATA_DIR = "data/processed"
 MODEL_DIR = "models"
-MODEL_PATH = os.path.join(MODEL_DIR, "clutch_model_v1.json")
+MODEL_PATH = os.path.join(MODEL_DIR, "clutch_model_v2.json")
 L3_DB_PATH = os.path.join("database", "L3", "L3.db")
 L2_DB_PATH = os.path.join("database", "L2", "L2.db")
 TEST_SIZE = 0.2
@@ -102,6 +102,36 @@ def preprocess_features(df):
    
    df['t_health'] = df['is_t'] * df['health']
    df['ct_health'] = df['is_ct'] * df['health']
+
+    if 'flash_duration' not in df.columns:
+        df['flash_duration'] = 0.0
+    df['flash_duration'] = pd.to_numeric(df['flash_duration'], errors='coerce').fillna(0.0).astype('float32')
+
+    if 'has_defuser' not in df.columns:
+        df['has_defuser'] = 0
+    df['has_defuser'] = df['has_defuser'].fillna(0).astype(int)
+
+    if 'has_helmet' not in df.columns:
+        df['has_helmet'] = 0
+    df['has_helmet'] = df['has_helmet'].fillna(0).astype(int)
+
+    if 'armor_value' not in df.columns:
+        df['armor_value'] = 0
+    df['armor_value'] = pd.to_numeric(df['armor_value'], errors='coerce').fillna(0.0).astype('float32')
+
+    is_alive_int = df['is_alive'].astype(int)
+    is_blinded = ((df['flash_duration'] > 0).astype(int) * is_alive_int).astype(int)
+
+    df['t_blinded_count_p'] = df['is_t'] * is_blinded
+    df['ct_blinded_count_p'] = df['is_ct'] * is_blinded
+    df['t_blind_time_sum_p'] = df['is_t'] * is_alive_int * df['flash_duration']
+    df['ct_blind_time_sum_p'] = df['is_ct'] * is_alive_int * df['flash_duration']
+
+    df['ct_defuser_count_p'] = df['is_ct'] * is_alive_int * df['has_defuser']
+    df['t_helmet_count_p'] = df['is_t'] * is_alive_int * df['has_helmet']
+    df['ct_helmet_count_p'] = df['is_ct'] * is_alive_int * df['has_helmet']
+    df['t_armor_sum_p'] = df['is_t'] * is_alive_int * df['armor_value']
+    df['ct_armor_sum_p'] = df['is_ct'] * is_alive_int * df['armor_value']
    
    # Aggregate per frame
    group_cols = ['match_id', 'map_name', 'round', 'tick', 'round_winner', 'is_bomb_planted', 'site']
@@ -124,6 +154,36 @@ def preprocess_features(df):
    # Note: 'round_winner' is in group_cols because it's constant per group
    features_df = df.groupby(group_cols).agg(agg_funcs).reset_index()
    
+    utility_agg = (
+        df.groupby(['match_id', 'round', 'tick'])
+        .agg({
+            't_blinded_count_p': 'sum',
+            'ct_blinded_count_p': 'sum',
+            't_blind_time_sum_p': 'sum',
+            'ct_blind_time_sum_p': 'sum',
+            'ct_defuser_count_p': 'sum',
+            't_helmet_count_p': 'sum',
+            'ct_helmet_count_p': 'sum',
+            't_armor_sum_p': 'sum',
+            'ct_armor_sum_p': 'sum'
+        })
+        .reset_index()
+        .rename(columns={
+            't_blinded_count_p': 't_blinded_count',
+            'ct_blinded_count_p': 'ct_blinded_count',
+            't_blind_time_sum_p': 't_blind_time_sum',
+            'ct_blind_time_sum_p': 'ct_blind_time_sum',
+            'ct_defuser_count_p': 'ct_defuser_count',
+            't_helmet_count_p': 't_helmet_count',
+            'ct_helmet_count_p': 'ct_helmet_count',
+            't_armor_sum_p': 't_armor_sum',
+            'ct_armor_sum_p': 'ct_armor_sum'
+        })
+    )
+
+    utility_agg['ct_has_defuser'] = (utility_agg['ct_defuser_count'] > 0).astype(int)
+    utility_agg['blinded_diff'] = utility_agg['ct_blinded_count'] - utility_agg['t_blinded_count']
+
    # 3. Add derived features
    features_df['health_diff'] = features_df['ct_health'] - features_df['t_health']
    features_df['alive_diff'] = features_df['ct_alive'] - features_df['t_alive']
@@ -140,6 +200,7 @@ def preprocess_features(df):
    # Keys: match_id, round, tick
    features_df = pd.merge(features_df, spatial_features, on=['match_id', 'round', 'tick'], how='left')
    features_df = pd.merge(features_df, economy_features, on=['match_id', 'round', 'tick'], how='left')
+    features_df = pd.merge(features_df, utility_agg, on=['match_id', 'round', 'tick'], how='left')

    rating_map = {}
    try:
@@ -237,7 +298,7 @@ def train_model(df):
    """Train XGBoost Classifier."""
    
    # Features (X) and Target (y)
-    feature_cols = FEATURE_COLUMNS
+    feature_cols = XGB_FEATURE_COLUMNS_V2
    target_col = 'round_winner'
    
    logging.info(f"Training features: {feature_cols}")
@@ -288,7 +349,7 @@ def train_model(df):
    model.fit(X_train, y_train)
    
    # Save Test Set for Evaluation Script
-    test_set_path = os.path.join("data", "processed", "test_set.parquet")
+    test_set_path = os.path.join("data", "processed", "test_set_v2.parquet")
    logging.info(f"Saving validation set to {test_set_path}...")
    test_df.to_parquet(test_set_path)
    
@@ -309,6 +370,8 @@ def main():
        os.makedirs(MODEL_DIR)
        
    try:
+        model_path = os.getenv("CLUTCH_XGB_MODEL_PATH", MODEL_PATH)
+
        # 1. Load
        raw_df = load_data(DATA_DIR)
        
@@ -323,8 +386,8 @@ def main():
        model = train_model(features_df)
        
        # 4. Save
-        model.save_model(MODEL_PATH)
-        logging.info(f"Model saved to {MODEL_PATH}")
+        model.save_model(model_path)
+        logging.info(f"Model saved to {model_path}")

        # 5. Save player experience map for inference (optional)
        if 'steamid' in raw_df.columns: