简化项目并切换到v2模型与场景报告

This commit is contained in:
xunyulin230420
2026-02-12 16:32:45 +08:00
parent 706940d8d3
commit a19da4728b
23 changed files with 454 additions and 1235 deletions

10
.gitignore vendored
View File

@@ -17,10 +17,14 @@ database/**/*.db
# Local demo snapshots (large) # Local demo snapshots (large)
data/processed/ data/processed/
data/demos/ data/demos/
data/sequences/
# Reports and Artifacts # Training / evaluation artifacts
reports/ data/sequences/
models/*.pth
models/player_experience.json
report_out.txt
run_log.txt
train_log.txt
# Local downloads / raw captures # Local downloads / raw captures
output_arena/ output_arena/

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1 +0,0 @@
{"76561197960690195": 1507, "76561197973140692": 670, "76561197975129851": 795, "76561197978835160": 670, "76561197989744167": 670, "76561197991272318": 670, "76561197995889730": 795, "76561197996678278": 5025, "76561198012872053": 724, "76561198013295375": 509, "76561198031890115": 509, "76561198041683378": 5025, "76561198045739761": 509, "76561198047472534": 795, "76561198057282432": 5025, "76561198058500492": 1507, "76561198060483793": 795, "76561198063336407": 820, "76561198068002993": 724, "76561198074762801": 5025, "76561198080703143": 724, "76561198113666193": 670, "76561198134401925": 1507, "76561198138828475": 820, "76561198164970560": 1507, "76561198168198200": 509, "76561198179538505": 509, "76561198193174134": 820, "76561198200982290": 1507, "76561198309839541": 724, "76561198353869335": 795, "76561198355739212": 820, "76561198855375325": 820, "76561199032006224": 4355, "76561199046478501": 724, "76561199091825101": 670}

View File

@@ -0,0 +1,26 @@
# v2 模型:不同场景下的指标报告
- 模型文件:`models/clutch_model_v2.json`
- 测试数据:`data/processed/test_set_v2.parquet`
- 帧数:`487`
说明:下表的 Precision/Recall/F1 均以 CT 胜利(=1) 作为正类。
| 类别 | 子类别 | 样本数 | 准确率(Acc) | 精确率(Prec) | 召回率(Rec) | F1 |
|---|---:|---:|---:|---:|---:|---:|
| Overall | All Data | 487 | 75.2% | 81.0% | 85.4% | 83.2% |
| Side | T Win (Terrorist) | 137 | 48.9% | 0.0% | 0.0% | 0.0% |
| Side | CT Win (Counter-Terrorist) | 350 | 85.4% | 100.0% | 85.4% | 92.1% |
| Player Count | Advantage (CT > T) | 247 | 76.5% | 72.1% | 97.9% | 83.0% |
| Player Count | Disadvantage (CT < T) | 188 | 73.9% | 99.2% | 72.7% | 83.9% |
| Player Count | Equal (CT == T) | 52 | 73.1% | 67.4% | 100.0% | 80.6% |
| Bomb Status | Not Planted | 326 | 69.6% | 72.5% | 82.2% | 77.0% |
| Bomb Status | Planted | 161 | 86.3% | 95.0% | 89.9% | 92.4% |
| Bomb Site | A | 161 | 86.3% | 95.0% | 89.9% | 92.4% |
| CT Defuser | No Kit | 118 | 50.8% | 71.2% | 46.2% | 56.1% |
| CT Defuser | Has Kit | 369 | 82.9% | 82.6% | 97.0% | 89.3% |
| Flash | No Flash | 452 | 73.5% | 80.0% | 84.4% | 82.1% |
| Flash | Any Flash | 35 | 97.1% | 95.8% | 100.0% | 97.9% |
| Time Phase | Early | 163 | 65.6% | 64.9% | 96.0% | 77.4% |
| Time Phase | Mid | 162 | 62.3% | 80.9% | 63.9% | 71.4% |
| Time Phase | Late | 162 | 97.5% | 100.0% | 96.9% | 98.4% |

View File

@@ -4,9 +4,6 @@ pandas>=2.0.0
numpy>=1.24.0 numpy>=1.24.0
flask>=3.0.0 flask>=3.0.0
scikit-learn>=1.3.0 scikit-learn>=1.3.0
jupyter>=1.0.0
matplotlib>=3.7.0
seaborn>=0.13.0
scipy>=1.10.0 scipy>=1.10.0
shap>=0.40.0
streamlit>=1.30.0 streamlit>=1.30.0
requests>=2.31.0

View File

@@ -1,90 +0,0 @@
import os
import sys
import pandas as pd
import numpy as np
import torch
import xgboost as xgb
from sklearn.metrics import accuracy_score, log_loss
import matplotlib.pyplot as plt
import seaborn as sns
# Add project root to path
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from src.training.models import ClutchAttentionLSTM
from src.features.definitions import FEATURE_COLUMNS
from src.inference.stacking_ensemble import StackingEnsemble
# Configuration
XGB_MODEL_PATH = "models/clutch_model_v1.json"
LSTM_MODEL_PATH = "models/clutch_attention_lstm_v1.pth"
TEST_DATA_PATH = "data/processed/test_set.parquet"
def analyze_ensemble():
    """Compare the XGBoost and LSTM base models on the held-out test set.

    Loads the test parquet, obtains both base-model probabilities through
    ``StackingEnsemble.get_base_predictions`` and prints three sections:
    the correlation between the two probability vectors, accuracy/log-loss
    per model, and an analysis of the samples on which the thresholded
    (``> 0.5``) predictions disagree.

    Returns ``None``; all output goes to stdout. Exits early with a message
    when the test data file is missing.
    """
    if not os.path.exists(TEST_DATA_PATH):
        print("Test data not found.")
        return

    print(f"Loading data from {TEST_DATA_PATH}...")
    df = pd.read_parquet(TEST_DATA_PATH)
    y = df['round_winner'].values

    # The ensemble object is only used for its base-model prediction helper.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    ensemble = StackingEnsemble(XGB_MODEL_PATH, LSTM_MODEL_PATH, device)

    print("Generating predictions...")
    base_preds = ensemble.get_base_predictions(df)
    prob_xgb = base_preds['prob_xgb'].values
    prob_lstm = base_preds['prob_lstm'].values

    # --- 1. Correlation between the two probability vectors ---
    corr_matrix = np.corrcoef(prob_xgb, prob_lstm)
    correlation = corr_matrix[0, 1]
    print("\n[Correlation Analysis]")
    print(f"Correlation between XGBoost and LSTM predictions: {correlation:.4f}")

    # --- 2. Accuracy and log loss per model ---
    pred_xgb = (prob_xgb > 0.5).astype(int)
    pred_lstm = (prob_lstm > 0.5).astype(int)

    acc_xgb = accuracy_score(y, pred_xgb)
    ll_xgb = log_loss(y, prob_xgb)
    acc_lstm = accuracy_score(y, pred_lstm)
    ll_lstm = log_loss(y, prob_lstm)

    print("\n[Performance Comparison]")
    print(f"XGBoost - Acc: {acc_xgb:.2%}, LogLoss: {ll_xgb:.4f}")
    print(f"LSTM - Acc: {acc_lstm:.2%}, LogLoss: {ll_lstm:.4f}")

    # --- 3. Samples where the hard predictions differ ---
    differs = pred_xgb != pred_lstm
    n_differs = np.sum(differs)

    print("\n[Disagreement Analysis]")
    print(f"Models disagree on {n_differs} / {len(df)} samples ({n_differs/len(df):.2%})")

    if n_differs > 0:
        contested = df[differs].copy()
        y_contested = y[differs]

        # Count how often each model matches the label on contested samples.
        xgb_hits = np.sum(pred_xgb[differs] == y_contested)
        lstm_hits = np.sum(pred_lstm[differs] == y_contested)

        print("In disagreement cases:")
        print(f" XGBoost correct: {xgb_hits} times")
        print(f" LSTM correct: {lstm_hits} times")

        print("\nExample Disagreements:")
        contested['prob_xgb'] = prob_xgb[differs]
        contested['prob_lstm'] = prob_lstm[differs]
        contested['actual'] = y_contested
        preview_cols = ['round', 'tick', 'prob_xgb', 'prob_lstm', 'actual']
        print(contested[preview_cols].head(5))
if __name__ == "__main__":
analyze_ensemble()

View File

@@ -1,95 +0,0 @@
import os
import sys
import pandas as pd
import numpy as np
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
# Add project root to path
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from src.features.definitions import FEATURE_COLUMNS
# Configuration
MODEL_PATH = "models/clutch_model_v1.json"
TEST_DATA_PATH = "data/processed/test_set.parquet"
REPORT_DIR = "reports"
def explain_model():
    """Generate global SHAP explanations for the v1 XGBoost model.

    Creates ``REPORT_DIR`` if needed, loads the test parquet and the model,
    computes SHAP values with ``shap.TreeExplainer`` and saves two figures
    into ``REPORT_DIR`` (a summary plot and a bar plot). Finally prints the
    three features with the largest mean absolute SHAP value.

    Returns ``None``. Exits early with a message when the test data or the
    model file is missing.
    """
    # 1. Make sure the output directory exists.
    if not os.path.exists(REPORT_DIR):
        os.makedirs(REPORT_DIR)

    # 2. Load the evaluation data.
    if not os.path.exists(TEST_DATA_PATH):
        print(f"Error: Test data not found at {TEST_DATA_PATH}")
        return
    print(f"Loading test data from {TEST_DATA_PATH}...")
    frame = pd.read_parquet(TEST_DATA_PATH)
    X = frame[FEATURE_COLUMNS]
    # Not used below; the lookup is kept so a missing label column still fails here.
    y = frame['round_winner']

    # 3. Load the model.
    if not os.path.exists(MODEL_PATH):
        print(f"Error: Model not found at {MODEL_PATH}")
        return
    print(f"Loading XGBoost model from {MODEL_PATH}...")
    model = xgb.XGBClassifier()
    model.load_model(MODEL_PATH)

    # 4. SHAP values for every test row.
    print("Calculating SHAP values (this may take a moment)...")
    shap_values = shap.TreeExplainer(model).shap_values(X)

    # 5./6. Both figures follow the same recipe and differ only in the extra
    # keyword arguments passed to shap.summary_plot and in their labels.
    plot_specs = (
        ("Generating SHAP Summary Plot...", {}, "shap_summary_v1.png", "Saved summary plot to: "),
        ("Generating SHAP Bar Plot...", {"plot_type": "bar"}, "shap_importance_v1.png", "Saved importance plot to: "),
    )
    for banner, plot_kwargs, file_name, saved_prefix in plot_specs:
        print(banner)
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, X, show=False, **plot_kwargs)
        target = os.path.join(REPORT_DIR, file_name)
        plt.savefig(target, bbox_inches='tight', dpi=300)
        plt.close()
        print(f"{saved_prefix}{target}")

    # 7. Text summary of the most influential features.
    rule = "=" * 50
    print("\n" + rule)
    print(" DATA ANALYST INTERVIEW INSIGHTS ")
    print(rule)

    # Global importance = mean absolute SHAP value per feature.
    importance = np.abs(shap_values).mean(axis=0)
    ranking = pd.DataFrame({
        'feature': FEATURE_COLUMNS,
        'importance': importance,
    }).sort_values('importance', ascending=False)
    top_3 = ranking.head(3)

    print("When an interviewer asks: 'What drives your model's predictions?'")
    print("You can answer based on this data:")
    leader = top_3.iloc[0]
    print(f"1. The most critical factor is '{leader['feature']}'.")
    print(f" (Impact Score: {leader['importance']:.4f})")
    print(f"2. Followed by '{top_3.iloc[1]['feature']}' and '{top_3.iloc[2]['feature']}'.")

    print("\nBusiness Interpretation:")
    print("- If 'economy' features are top: The model confirms that money buys win rate.")
    print("- If 'spatial' features are top: The model understands map control is key.")
    print("- If 'status' (health/alive) features are top: The model relies on basic manpower advantage.")
    print("-" * 50)
    print(f"Check the visualizations in the '{REPORT_DIR}' folder to practice your storytelling.")
if __name__ == "__main__":
explain_model()

View File

@@ -1,126 +0,0 @@
import os
import sys
import pandas as pd
import xgboost as xgb
import shap
import numpy as np
# Add project root to path
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
# Define Model Path
MODEL_PATH = "models/clutch_model_v1.json"
def main():
    """Explain the model's prediction for one hard-coded 2v2 scenario.

    Loads the XGBoost model from ``MODEL_PATH``, predicts the CT/T win
    probabilities for a fixed single-row feature vector, and prints the
    per-feature SHAP attribution sorted by absolute impact.

    Returns ``None``. Exits early with a message when the model file is
    missing.
    """
    # 1. Load the model.
    if not os.path.exists(MODEL_PATH):
        print(f"Error: Model not found at {MODEL_PATH}")
        return
    model = xgb.XGBClassifier()
    model.load_model(MODEL_PATH)
    print("Model loaded successfully.")

    # 2. The 2v2 scenario. According to the original notes these values
    # mirror the "features_used" output of test_advanced_inference.py
    # (rounded to four decimals where applicable). Key order doubles as
    # the column order handed to the model.
    scenario = {
        't_alive': 2,
        'ct_alive': 2,
        't_health': 200,
        'ct_health': 200,
        'health_diff': 0,
        'alive_diff': 0,
        'game_time': 90.0,
        'team_distance': 525.5949,
        't_spread': 50.0,
        'ct_spread': 2549.51,
        't_area': 0.0,
        'ct_area': 0.0,
        't_pincer_index': 0.0951,
        'ct_pincer_index': 4.8507,
        't_total_cash': 3500,
        'ct_total_cash': 9750,
        't_equip_value': 7400,
        'ct_equip_value': 10050,
        'is_bomb_planted': 1,
        'site': 401,
    }
    feature_cols = list(scenario)
    df = pd.DataFrame([scenario], columns=feature_cols)

    # 3. Predict; column 1 of predict_proba is read as the CT win probability.
    prob_ct = model.predict_proba(df)[0][1]
    print("\nScenario Prediction:")
    print(f"T Win Probability: {1-prob_ct:.4f}")
    print(f"CT Win Probability: {prob_ct:.4f}")

    # 4. SHAP attribution for the single row.
    print("\nCalculating SHAP values...")
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(df)

    # NOTE(review): formatted as a scalar below; presumably a log-odds base
    # value for this binary model - confirm against the installed shap version.
    base_value = explainer.expected_value
    print(f"Base Value (Log Odds): {base_value:.4f}")

    # One result row per feature; shap_values[0] is the row for our sample.
    results = pd.DataFrame({
        'Feature': feature_cols,
        'Value': df.iloc[0].values,
        'SHAP Impact': shap_values[0],
    })
    results['Abs Impact'] = results['SHAP Impact'].abs()
    results = results.sort_values(by='Abs Impact', ascending=False)

    rule = "-" * 80
    print("\nFeature Attribution (Why did the model predict this?):")
    print(rule)
    print(f"{'Feature':<20} | {'Value':<15} | {'SHAP Impact':<15} | {'Effect'}")
    print(rule)

    for feature, value, impact in zip(results['Feature'], results['Value'], results['SHAP Impact']):
        if impact < 0:
            effect = "T Favored"
        else:
            effect = "CT Favored"
        print(f"{feature:<20} | {value:<15.4f} | {impact:<15.4f} | {effect}")

    print(rule)
    print("Note: Negative SHAP values push probability towards Class 0 (T Win).")
    print(" Positive SHAP values push probability towards Class 1 (CT Win).")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,221 @@
"""
Clutch-IQ Comprehensive Metrics Report
======================================
Generates scenario breakdowns for the active XGBoost model (v2).
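Metrics include:
- Accuracy, Precision, Recall, F1 (Overall)
- Breakdown by Scenario (column-dependent sections are skipped when the column is absent):
    - Side (actual round winner: T Win vs CT Win)
    - Player Advantage (CT > T, CT < T, Equal)
    - Bomb Status (Planted vs Not Planted)
    - Bomb Site (A vs B, planted frames only; requires `site`)
    - CT Defuser (Has Kit vs No Kit; requires `ct_has_defuser`)
    - Flash (Any Flash vs No Flash; requires `t_blinded_count` and `ct_blinded_count`)
    - Game Phase (Early, Mid, Late) (by quantiles of game_time)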
Usage:
python src/analysis/scenario_accuracy.py
"""
import os
import sys
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Add project root to path
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from src.features.definitions import FEATURE_COLUMNS
# Configuration
# Each path below can be overridden through the named environment variable;
# the second argument is the default used when the variable is unset.

# XGBoost model artifact loaded by main().
XGB_MODEL_PATH = os.getenv(
    "CLUTCH_XGB_MODEL_PATH",
    "models/clutch_model_v2.json"
)
# Parquet file with the evaluation frames; main() requires a `round_winner` column.
TEST_DATA_PATH = os.getenv(
    "CLUTCH_TEST_DATA_PATH",
    "data/processed/test_set_v2.parquet"
)
# Destination of the generated Markdown report.
REPORT_PATH = os.getenv(
    "CLUTCH_REPORT_PATH",
    "reports/v2_场景准召_指标报告.md"
)
def _calculate_metrics(y_true, y_pred):
    """Return accuracy, precision, recall and F1 for binary predictions.

    Precision, recall and F1 treat label ``1`` as the positive class and
    evaluate to ``0`` instead of warning when they are undefined
    (``zero_division=0``).

    Returns:
        dict with the keys ``"Acc"``, ``"Prec"``, ``"Rec"`` and ``"F1"``.
    """
    metrics = {"Acc": accuracy_score(y_true, y_pred)}
    positive_class_scorers = (
        ("Prec", precision_score),
        ("Rec", recall_score),
        ("F1", f1_score),
    )
    for key, scorer in positive_class_scorers:
        metrics[key] = scorer(y_true, y_pred, pos_label=1, zero_division=0)
    return metrics
def get_scenario_label(row):
    """Classify one frame by player-count balance and bomb state.

    Args:
        row: mapping-like object providing ``ct_alive``, ``t_alive`` and
            ``is_bomb_planted``.

    Returns:
        Tuple ``(player_label, bomb_label)`` where the player label is seen
        from the CT side and the bomb label is ``"Planted"`` only when
        ``is_bomb_planted`` equals ``1``.
    """
    # Player-count balance from the CT perspective.
    ct_count = row['ct_alive']
    t_count = row['t_alive']
    if ct_count > t_count:
        player_label = "Advantage (CT > T)"
    elif ct_count < t_count:
        player_label = "Disadvantage (CT < T)"
    else:
        player_label = "Equal (CT == T)"

    # Bomb state.
    if row['is_bomb_planted'] == 1:
        bomb_label = "Planted"
    else:
        bomb_label = "Not Planted"

    return player_label, bomb_label
def _build_time_phase(df):
if "game_time" not in df.columns or df["game_time"].dropna().empty:
return None
qs = df["game_time"].quantile([1/3, 2/3]).values.tolist()
q1, q2 = float(qs[0]), float(qs[1])
def _label(x):
if x <= q1:
return "Early"
if x <= q2:
return "Mid"
return "Late"
return df["game_time"].apply(_label)
def _add_results(results, category, subcategory, y_true, y_pred):
    """Append one report row for a (category, subcategory) slice.

    Computes the metrics with ``_calculate_metrics`` and appends a dict with
    the slice labels, the sample count and the four metric values to
    ``results`` (mutated in place). Returns ``None``.
    """
    scores = _calculate_metrics(y_true, y_pred)
    row = {
        "Category": category,
        "Subcategory": subcategory,
        "Count": int(len(y_true)),
    }
    # Copy the metric values over in the report's column order.
    for key in ("Acc", "Prec", "Rec", "F1"):
        row[key] = scores[key]
    results.append(row)
def _to_pct(x):
try:
return f"{float(x) * 100:.1f}%"
except Exception:
return ""
def save_markdown_report(res_df, model_path, test_data_path, frames, output_path):
    """Write the scenario metrics as a Markdown table to ``output_path``.

    Args:
        res_df: DataFrame with the columns ``Category``, ``Subcategory``,
            ``Count``, ``Acc``, ``Prec``, ``Rec`` and ``F1``.
        model_path: model path shown in the report header.
        test_data_path: test data path shown in the report header.
        frames: number of evaluated frames shown in the report header.
        output_path: destination file; its parent directory is created
            when it does not exist.

    The file is written as UTF-8 and a confirmation line is printed.
    """
    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Fixed preamble: title, run metadata, positive-class note, table header.
    chunks = [
        "# v2 模型:不同场景下的指标报告\n\n",
        f"- 模型文件:`{model_path}`\n"
        f"- 测试数据:`{test_data_path}`\n"
        f"- 帧数:`{frames}`\n\n"
        "说明:下表的 Precision/Recall/F1 均以 CT 胜利(=1) 作为正类。\n\n",
        "| 类别 | 子类别 | 样本数 | 准确率(Acc) | 精确率(Prec) | 召回率(Rec) | F1 |\n",
        "|---|---:|---:|---:|---:|---:|---:|\n",
    ]

    # One table line per result row; metric cells are rendered as percentages.
    for _, record in res_df.iterrows():
        cells = [
            f"{record['Category']}",
            f"{record['Subcategory']}",
            f"{int(record['Count'])}",
            _to_pct(record['Acc']),
            _to_pct(record['Prec']),
            _to_pct(record['Rec']),
            _to_pct(record['F1']),
        ]
        chunks.append("| " + " | ".join(cells) + " |\n")

    with open(output_path, "w", encoding="utf-8") as handle:
        handle.write("".join(chunks))
    print(f"[Report] Markdown report saved to {output_path}")
def _add_subset_results(results, category, subcategory, subset):
    """Append a metrics row for ``subset`` to ``results``; skip empty subsets.

    A slice without rows has no meaningful metrics, so no row is emitted
    for it (the same rule for every report section).
    """
    if subset.empty:
        return
    _add_results(
        results,
        category,
        subcategory,
        subset["y_true"].values,
        subset["y_pred"].values,
    )


def main():
    """Evaluate the XGBoost model on the test set and report scenario metrics.

    Loads the test parquet and the model, predicts with a 0.5 probability
    threshold, and computes accuracy/precision/recall/F1 overall and per
    slice: actual winner side, player-count balance, bomb status, bomb site
    (planted frames only), CT defuser, flash and time phase. Sections whose
    source columns are absent are skipped, and slices without rows are
    omitted. The table is printed to stdout and written to ``REPORT_PATH``.

    Returns ``None``. Prints an error and returns early when the test data
    or model file is missing, when the test data is empty, or when it lacks
    ``round_winner``, a model feature column, or one of the columns needed
    for scenario labelling.
    """
    if not os.path.exists(TEST_DATA_PATH):
        print(f"Error: Test data not found at {TEST_DATA_PATH}")
        return
    if not os.path.exists(XGB_MODEL_PATH):
        print(f"Error: Model not found at {XGB_MODEL_PATH}")
        return

    df = pd.read_parquet(TEST_DATA_PATH)
    if "round_winner" not in df.columns:
        print("Error: round_winner column not found in test data")
        return
    if df.empty:
        # Nothing to score; also avoids unpacking an empty zip() below.
        print("Error: Test data is empty")
        return

    y_true = df["round_winner"].astype(int).values

    model = xgb.XGBClassifier()
    model.load_model(XGB_MODEL_PATH)

    # Prefer the feature names stored in the model so that column order
    # matches training; fall back to the project-wide definition.
    xgb_feature_names = model.get_booster().feature_names or list(FEATURE_COLUMNS)

    # Fail with a readable message instead of a KeyError when the test set
    # lacks a model feature or a column used by get_scenario_label().
    required = list(xgb_feature_names) + ["ct_alive", "t_alive", "is_bomb_planted"]
    missing = sorted({col for col in required if col not in df.columns})
    if missing:
        print(f"Error: Test data is missing required columns: {missing}")
        return

    X = df[xgb_feature_names]
    probs = model.predict_proba(X)[:, 1]
    y_pred = (probs >= 0.5).astype(int)

    df_eval = df.copy()
    df_eval["y_true"] = y_true
    df_eval["y_pred"] = y_pred
    df_eval["Player_Scenario"], df_eval["Bomb_Scenario"] = zip(
        *df_eval.apply(get_scenario_label, axis=1)
    )
    time_phase = _build_time_phase(df_eval)
    if time_phase is not None:
        df_eval["Time_Phase"] = time_phase

    results = []
    _add_subset_results(results, "Overall", "All Data", df_eval)

    # Slices by the actual round winner.
    for label, name in [(0, "T Win (Terrorist)"), (1, "CT Win (Counter-Terrorist)")]:
        _add_subset_results(results, "Side", name, df_eval[df_eval["y_true"] == label])

    for sc in sorted(df_eval["Player_Scenario"].unique()):
        _add_subset_results(results, "Player Count", sc, df_eval[df_eval["Player_Scenario"] == sc])

    for sc in sorted(df_eval["Bomb_Scenario"].unique()):
        _add_subset_results(results, "Bomb Status", sc, df_eval[df_eval["Bomb_Scenario"] == sc])

    if "site" in df_eval.columns:
        planted = df_eval[df_eval["is_bomb_planted"] == 1].copy()
        if not planted.empty:
            # NOTE(review): every value other than 1 is reported as site "A".
            # Confirm the encoding of `site` in the v2 data set - if it is not
            # 0/1, all planted frames end up under "A".
            planted["Site_Label"] = planted["site"].apply(lambda x: "B" if int(x) == 1 else "A")
            for sc in sorted(planted["Site_Label"].unique()):
                _add_subset_results(results, "Bomb Site", sc, planted[planted["Site_Label"] == sc])

    if "ct_has_defuser" in df_eval.columns:
        has_kit = df_eval["ct_has_defuser"].astype(int)
        for sc in [0, 1]:
            _add_subset_results(
                results,
                "CT Defuser",
                "Has Kit" if sc == 1 else "No Kit",
                df_eval[has_kit == sc],
            )

    if "t_blinded_count" in df_eval.columns and "ct_blinded_count" in df_eval.columns:
        any_flash = (
            df_eval["t_blinded_count"].astype(float)
            + df_eval["ct_blinded_count"].astype(float)
        ) > 0
        _add_subset_results(results, "Flash", "No Flash", df_eval[~any_flash])
        _add_subset_results(results, "Flash", "Any Flash", df_eval[any_flash])

    if "Time_Phase" in df_eval.columns:
        for sc in ["Early", "Mid", "Late"]:
            _add_subset_results(results, "Time Phase", sc, df_eval[df_eval["Time_Phase"] == sc])

    res_df = pd.DataFrame(results)

    print(f"Model: {XGB_MODEL_PATH}")
    print(f"Test Data: {TEST_DATA_PATH}")
    print(f"Frames: {len(df_eval)}")
    print()
    print("| Category | Subcategory | Count | Acc | Prec | Rec | F1 |")
    print("|---|---|---:|---:|---:|---:|---:|")
    for _, row in res_df.iterrows():
        print(
            f"| {row['Category']} | {row['Subcategory']} | {int(row['Count'])} | "
            f"{row['Acc']:.1%} | {row['Prec']:.1%} | {row['Rec']:.1%} | {row['F1']:.1%} |"
        )

    save_markdown_report(
        res_df=res_df,
        model_path=XGB_MODEL_PATH,
        test_data_path=TEST_DATA_PATH,
        frames=len(df_eval),
        output_path=REPORT_PATH,
    )
if __name__ == "__main__":
main()

View File

@@ -53,26 +53,22 @@ def process_file(filepath):
logging.info(f"Processing new file: {filepath}") logging.info(f"Processing new file: {filepath}")
# We use subprocess to isolate memory usage and ensure clean state per file # We use subprocess to isolate memory usage and ensure clean state per file
# Updated: Now using --file argument for direct processing
cmd = [ cmd = [
sys.executable, sys.executable,
EXTRACT_SCRIPT, EXTRACT_SCRIPT,
"--demo_dir", os.path.dirname(filepath), # Temporarily point to where the file is "--file", filepath,
"--output_dir", OUTPUT_DIR, "--output_dir", OUTPUT_DIR,
"--delete-source" # Critical flag! "--delete-source" # Critical flag!
] ]
try: try:
# Note: extract_snapshots.py currently scans the whole dir. # Note: extract_snapshots.py now supports single file processing via --file.
# This is inefficient if we monitor a busy Downloads folder. # This allows us to process files directly from any location without moving them first,
# Ideally we should pass the specific file path. # or process them efficiently in the staging area.
# But for now, since we only care about .dem files and we delete them, it's okay.
# However, to avoid processing other .dem files in Downloads that user might want to keep,
# we should probably move it to a temp folder first?
# Or better: Update extract_snapshots.py to accept a single file.
# For safety in "Downloads" folder scenario: # However, for consistency and to avoid locking files in Downloads folder,
# 1. Move file to data/demos (staging area) # we still recommend moving to staging area first.
# 2. Process it there
staging_dir = os.path.abspath("data/demos") staging_dir = os.path.abspath("data/demos")
if not os.path.exists(staging_dir): if not os.path.exists(staging_dir):
@@ -85,6 +81,8 @@ def process_file(filepath):
if os.path.dirname(filepath) != staging_dir: if os.path.dirname(filepath) != staging_dir:
logging.info(f"Moving {filename} to staging area...") logging.info(f"Moving {filename} to staging area...")
try: try:
# Use move with retry/check?
# For now simple rename
os.rename(filepath, staged_path) os.rename(filepath, staged_path)
except OSError as e: except OSError as e:
logging.error(f"Failed to move file: {e}") logging.error(f"Failed to move file: {e}")
@@ -92,11 +90,11 @@ def process_file(filepath):
else: else:
staged_path = filepath staged_path = filepath
# Now process from staging # Now process from staging using the specific file path
cmd = [ cmd = [
sys.executable, sys.executable,
EXTRACT_SCRIPT, EXTRACT_SCRIPT,
"--demo_dir", staging_dir, "--file", staged_path,
"--output_dir", OUTPUT_DIR, "--output_dir", OUTPUT_DIR,
"--delete-source" "--delete-source"
] ]
@@ -106,8 +104,11 @@ def process_file(filepath):
if result.returncode == 0: if result.returncode == 0:
logging.info(f"Successfully processed batch.") logging.info(f"Successfully processed batch.")
logging.info(result.stdout) logging.info(result.stdout)
if result.stderr:
logging.info(f"Logs:\n{result.stderr}")
else: else:
logging.error(f"Processing failed with code {result.returncode}") logging.error(f"Processing failed with code {result.returncode}")
logging.error(result.stdout)
logging.error(result.stderr) logging.error(result.stderr)
except Exception as e: except Exception as e:

View File

@@ -17,6 +17,7 @@ import pandas as pd
import numpy as np import numpy as np
from demoparser2 import DemoParser # 核心依赖 from demoparser2 import DemoParser # 核心依赖
import logging import logging
import sys
# ============================================================================== # ==============================================================================
# ⚙️ 配置与调优参数 (可修改参数区) # ⚙️ 配置与调优参数 (可修改参数区)
@@ -321,10 +322,15 @@ def process_demo(demo_path, output_dir, delete_source=False):
except Exception as e: except Exception as e:
logging.error(f"处理失败 {demo_name}: {str(e)}") logging.error(f"处理失败 {demo_name}: {str(e)}")
# 如果是 Source 1 错误,给予明确提示
if "Source1" in str(e):
logging.error("❌ 这是一个 CS:GO (Source 1) 的 Demo本系统仅支持 CS2 (Source 2) Demo。")
sys.exit(1)
def main(): def main():
parser = argparse.ArgumentParser(description="L1B 快照引擎") parser = argparse.ArgumentParser(description="L1B 快照引擎")
parser.add_argument('--demo_dir', type=str, default='data/demos', help='输入 .dem 文件的目录') parser.add_argument('--demo_dir', type=str, default='data/demos', help='输入 .dem 文件的目录')
parser.add_argument('--file', type=str, help='处理单个 .dem 文件 (如果指定此参数,将忽略 --demo_dir)')
parser.add_argument('--output_dir', type=str, default='data/processed', help='输出 .parquet 文件的目录') parser.add_argument('--output_dir', type=str, default='data/processed', help='输出 .parquet 文件的目录')
parser.add_argument('--delete-source', action='store_true', help='处理成功后删除源文件') parser.add_argument('--delete-source', action='store_true', help='处理成功后删除源文件')
args = parser.parse_args() args = parser.parse_args()
@@ -332,7 +338,22 @@ def main():
if not os.path.exists(args.output_dir): if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir) os.makedirs(args.output_dir)
# 获取 demo 列表 # 模式 1: 单文件处理
if args.file:
if not os.path.exists(args.file):
logging.error(f"文件不存在: {args.file}")
return
if not args.file.endswith('.dem'):
logging.error(f"无效的文件扩展名: {args.file}")
return
process_demo(args.file, args.output_dir, delete_source=args.delete_source)
return
# 模式 2: 目录批处理
if not os.path.exists(args.demo_dir):
logging.warning(f"目录不存在: {args.demo_dir}")
return
demo_files = [os.path.join(args.demo_dir, f) for f in os.listdir(args.demo_dir) if f.endswith('.dem')] demo_files = [os.path.join(args.demo_dir, f) for f in os.listdir(args.demo_dir) if f.endswith('.dem')]
if not demo_files: if not demo_files:

View File

@@ -5,7 +5,6 @@ This module defines the canonical list of features used in the Clutch-IQ model.
Centralizing these definitions ensures consistency between training (train.py) and inference (app.py). Centralizing these definitions ensures consistency between training (train.py) and inference (app.py).
""" """
# 1. Status Features (Basic survival status)
STATUS_FEATURES = [ STATUS_FEATURES = [
't_alive', 't_alive',
'ct_alive', 'ct_alive',
@@ -15,7 +14,6 @@ STATUS_FEATURES = [
'alive_diff' 'alive_diff'
] ]
# 2. Economy & Equipment Features (Combat power)
ECONOMY_FEATURES = [ ECONOMY_FEATURES = [
't_total_cash', 't_total_cash',
'ct_total_cash', 'ct_total_cash',
@@ -23,7 +21,6 @@ ECONOMY_FEATURES = [
'ct_equip_value' 'ct_equip_value'
] ]
# 3. Spatial & Tactical Features (Map control)
SPATIAL_FEATURES = [ SPATIAL_FEATURES = [
'team_distance', 'team_distance',
't_spread', 't_spread',
@@ -34,14 +31,12 @@ SPATIAL_FEATURES = [
'ct_pincer_index' 'ct_pincer_index'
] ]
# 4. Context Features (Match situation)
CONTEXT_FEATURES = [ CONTEXT_FEATURES = [
'is_bomb_planted', 'is_bomb_planted',
'site', 'site',
'game_time' 'game_time'
] ]
# 5. Player Capability Features (Individual skill/experience)
PLAYER_FEATURES = [ PLAYER_FEATURES = [
't_player_experience', 't_player_experience',
'ct_player_experience', 'ct_player_experience',
@@ -49,29 +44,24 @@ PLAYER_FEATURES = [
'ct_player_rating' 'ct_player_rating'
] ]
# Master list of all features used for model training and inference UTILITY_FEATURES = [
# ORDER MATTERS: This order must be preserved to match the trained model artifact. 't_blinded_count',
FEATURE_COLUMNS = ( 'ct_blinded_count',
STATUS_FEATURES + 't_blind_time_sum',
[CONTEXT_FEATURES[2]] + # game_time is usually placed here in the legacy order, let's check 'ct_blind_time_sum',
SPATIAL_FEATURES + 'blinded_diff'
ECONOMY_FEATURES + ]
CONTEXT_FEATURES[0:2] + # is_bomb_planted, site
PLAYER_FEATURES
)
# Re-defining specifically to match the EXACT order from the original code to avoid breaking the model GEAR_FEATURES = [
# Original order: 'ct_has_defuser',
# 't_alive', 'ct_alive', 't_health', 'ct_health', 'ct_defuser_count',
# 'health_diff', 'alive_diff', 'game_time', 't_helmet_count',
# 'team_distance', 't_spread', 'ct_spread', 't_area', 'ct_area', 'ct_helmet_count',
# 't_pincer_index', 'ct_pincer_index', 't_armor_sum',
# 't_total_cash', 'ct_total_cash', 't_equip_value', 'ct_equip_value', 'ct_armor_sum'
# 'is_bomb_planted', 'site', ]
# 't_player_experience', 'ct_player_experience',
# 't_player_rating', 'ct_player_rating'
FEATURE_COLUMNS = [ FEATURE_COLUMNS_V1 = [
't_alive', 'ct_alive', 't_health', 'ct_health', 't_alive', 'ct_alive', 't_health', 'ct_health',
'health_diff', 'alive_diff', 'game_time', 'health_diff', 'alive_diff', 'game_time',
'team_distance', 't_spread', 'ct_spread', 't_area', 'ct_area', 'team_distance', 't_spread', 'ct_spread', 't_area', 'ct_area',
@@ -81,3 +71,10 @@ FEATURE_COLUMNS = [
't_player_experience', 'ct_player_experience', 't_player_experience', 'ct_player_experience',
't_player_rating', 'ct_player_rating' 't_player_rating', 'ct_player_rating'
] ]
FEATURE_COLUMNS_V2 = FEATURE_COLUMNS_V1 + UTILITY_FEATURES + GEAR_FEATURES
FEATURE_COLUMNS = FEATURE_COLUMNS_V2
XGB_FEATURE_COLUMNS_V1 = FEATURE_COLUMNS_V1
XGB_FEATURE_COLUMNS_V2 = FEATURE_COLUMNS_V2

View File

@@ -18,7 +18,7 @@ from flask import Flask, request, jsonify, Response
sys.path.append(os.path.join(os.path.dirname(__file__), '../..')) sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from src.features.spatial import calculate_spatial_features from src.features.spatial import calculate_spatial_features
from src.features.economy import calculate_economy_features from src.features.economy import calculate_economy_features
from src.features.definitions import FEATURE_COLUMNS from src.features.definitions import FEATURE_COLUMNS, FEATURE_COLUMNS_V1, FEATURE_COLUMNS_V2
# Configure logging # Configure logging
logging.basicConfig( logging.basicConfig(
@@ -30,11 +30,12 @@ logging.basicConfig(
app = Flask(__name__) app = Flask(__name__)
# Load Model # Load Model
MODEL_PATH = "models/clutch_model_v1.json" MODEL_PATH = "models/clutch_model_v2.json"
PLAYER_EXPERIENCE_PATH = "models/player_experience.json" PLAYER_EXPERIENCE_PATH = "models/player_experience.json"
L3_DB_PATH = "database/L3/L3.db" L3_DB_PATH = "database/L3/L3.db"
L2_DB_PATH = "database/L2/L2.db" L2_DB_PATH = "database/L2/L2.db"
model = None model = None
active_feature_columns = FEATURE_COLUMNS_V2
player_experience_map = {} player_experience_map = {}
player_rating_map = {} player_rating_map = {}
last_gsi_result = None last_gsi_result = None
@@ -160,16 +161,26 @@ def gsi_to_payload(gsi):
} }
def load_model(): def load_model():
global model global model, active_feature_columns
if os.path.exists(MODEL_PATH): model_path = os.getenv("CLUTCH_MODEL_PATH")
if not model_path:
model_version = os.getenv("CLUTCH_MODEL_VERSION", "v2").strip().lower()
model_path = "models/clutch_model_v2.json" if model_version == "v2" else MODEL_PATH
if os.path.exists(model_path):
try: try:
model = xgb.XGBClassifier() model = xgb.XGBClassifier()
model.load_model(MODEL_PATH) model.load_model(model_path)
logging.info(f"Model loaded successfully from {MODEL_PATH}") booster_feature_names = model.get_booster().feature_names
if booster_feature_names:
active_feature_columns = list(booster_feature_names)
else:
active_feature_columns = FEATURE_COLUMNS_V2
logging.info(f"Model loaded successfully from {model_path} (features={len(active_feature_columns)})")
except Exception as e: except Exception as e:
logging.error(f"Failed to load model: {e}") logging.error(f"Failed to load model: {e}")
else: else:
logging.error(f"Model file not found at {MODEL_PATH}") logging.error(f"Model file not found at {model_path}")
def load_player_experience(): def load_player_experience():
global player_experience_map global player_experience_map
@@ -242,7 +253,7 @@ def process_payload(payload):
payload['alive_diff'] = payload.get('ct_alive', 0) - payload.get('t_alive', 0) payload['alive_diff'] = payload.get('ct_alive', 0) - payload.get('t_alive', 0)
# Ensure order matches training # Ensure order matches training
cols = FEATURE_COLUMNS cols = active_feature_columns
# Create single-row DataFrame # Create single-row DataFrame
data = {k: [payload.get(k, 0)] for k in cols} data = {k: [payload.get(k, 0)] for k in cols}
@@ -357,33 +368,53 @@ def process_payload(payload):
t_equip_value = 0 t_equip_value = 0
ct_equip_value = 0 ct_equip_value = 0
# Construct feature vector ct_defuser_count = int(df[(df['team_num'] == 3) & (df['is_alive']) & (df['has_defuser'] == True)].shape[0])
# Order MUST match train.py feature_cols ct_has_defuser = 1 if ct_defuser_count > 0 else 0
# ['t_alive', 'ct_alive', 't_health', 'ct_health', 'health_diff', 'alive_diff', 'game_time', t_helmet_count = int(df[(df['team_num'] == 2) & (df['is_alive']) & (df['has_helmet'] == True)].shape[0])
# 'team_distance', 't_spread', 'ct_spread', 't_area', 'ct_area', 't_pincer_index', 'ct_pincer_index', ct_helmet_count = int(df[(df['team_num'] == 3) & (df['is_alive']) & (df['has_helmet'] == True)].shape[0])
# 't_total_cash', 'ct_total_cash', 't_equip_value', 'ct_equip_value', 'is_bomb_planted', 'site'] t_armor_sum = float(df[(df['team_num'] == 2) & (df['is_alive'])]['armor_value'].sum())
ct_armor_sum = float(df[(df['team_num'] == 3) & (df['is_alive'])]['armor_value'].sum())
features = [
t_alive, ct_alive, t_health, ct_health, feature_values = {
health_diff, alive_diff, game_time, 't_alive': t_alive,
team_distance, t_spread, ct_spread, t_area, ct_area, 'ct_alive': ct_alive,
t_pincer_index, ct_pincer_index, 't_health': t_health,
t_total_cash, ct_total_cash, t_equip_value, ct_equip_value, 'ct_health': ct_health,
is_bomb_planted, site, 'health_diff': health_diff,
t_player_experience, ct_player_experience, 'alive_diff': alive_diff,
t_player_rating, ct_player_rating 'game_time': game_time,
] 'team_distance': team_distance,
't_spread': t_spread,
return pd.DataFrame([features], columns=[ 'ct_spread': ct_spread,
't_alive', 'ct_alive', 't_health', 'ct_health', 't_area': t_area,
'health_diff', 'alive_diff', 'game_time', 'ct_area': ct_area,
'team_distance', 't_spread', 'ct_spread', 't_area', 'ct_area', 't_pincer_index': t_pincer_index,
't_pincer_index', 'ct_pincer_index', 'ct_pincer_index': ct_pincer_index,
't_total_cash', 'ct_total_cash', 't_equip_value', 'ct_equip_value', 't_total_cash': t_total_cash,
'is_bomb_planted', 'site', 'ct_total_cash': ct_total_cash,
't_player_experience', 'ct_player_experience', 't_equip_value': t_equip_value,
't_player_rating', 'ct_player_rating' 'ct_equip_value': ct_equip_value,
]) 'is_bomb_planted': is_bomb_planted,
'site': site,
't_player_experience': t_player_experience,
'ct_player_experience': ct_player_experience,
't_player_rating': t_player_rating,
'ct_player_rating': ct_player_rating,
't_blinded_count': 0,
'ct_blinded_count': 0,
't_blind_time_sum': 0.0,
'ct_blind_time_sum': 0.0,
'blinded_diff': 0,
'ct_has_defuser': ct_has_defuser,
'ct_defuser_count': ct_defuser_count,
't_helmet_count': t_helmet_count,
'ct_helmet_count': ct_helmet_count,
't_armor_sum': t_armor_sum,
'ct_armor_sum': ct_armor_sum
}
cols = active_feature_columns
return pd.DataFrame([{k: feature_values.get(k, 0) for k in cols}], columns=cols)
except Exception as e: except Exception as e:
logging.error(f"Error processing payload: {e}") logging.error(f"Error processing payload: {e}")

View File

@@ -1,182 +0,0 @@
"""
Ensemble Framework: XGBoost + LSTM Fusion
=========================================
This script demonstrates the framework for combining the static state analysis of XGBoost
with the temporal trend analysis of LSTM to produce a robust final prediction.
Methodology:
1. Load both trained models (XGBoost .json, LSTM .pth).
2. Prepare input data:
- XGBoost: Takes single frame features (24 dims).
- LSTM: Takes sequence of last 10 frames (10x24 dims).
3. Generate independent probabilities: P_xgb, P_lstm.
4. Fuse predictions using Weighted Averaging:
P_final = alpha * P_xgb + (1 - alpha) * P_lstm
5. Evaluate performance on the test set.
Usage:
python src/inference/ensemble_framework.py
"""
import os
import sys
import numpy as np
import pandas as pd
import torch
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, log_loss
# Ensure imports work
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from src.training.models import ClutchLSTM
from src.training.sequence_prep import create_sequences
from src.features.definitions import FEATURE_COLUMNS
# Configuration
XGB_MODEL_PATH = "models/clutch_v1.model.json"
LSTM_MODEL_PATH = "models/clutch_lstm_v1.pth"
TEST_DATA_PATH = "data/processed/test_set.parquet" # The test set saved by train.py
# Fusion Hyperparameter
# 0.6 means we trust XGBoost slightly more (currently it has higher accuracy ~84% vs LSTM ~77%)
ALPHA = 0.6
class ClutchEnsemble:
    """Weighted-average fusion of an XGBoost classifier and a ClutchLSTM.

    The XGBoost model scores every frame independently; the LSTM scores the
    last frame of each 10-frame window inside a (match_id, round) group.
    Frames that have no full window fall back to the XGBoost probability.
    """

    def __init__(self, xgb_path, lstm_path, device='cpu'):
        """Load both base models.

        Args:
            xgb_path: Path of the saved XGBoost model file.
            lstm_path: Path of the saved LSTM ``state_dict``.
            device: Torch device the LSTM is moved to and evaluated on.
        """
        self.device = device

        # Load XGBoost
        print(f"Loading XGBoost from {xgb_path}...")
        self.xgb_model = xgb.XGBClassifier()
        self.xgb_model.load_model(xgb_path)

        # Load LSTM; its input width is the number of entries in FEATURE_COLUMNS.
        print(f"Loading LSTM from {lstm_path}...")
        self.lstm_model = ClutchLSTM(input_dim=len(FEATURE_COLUMNS)).to(device)
        self.lstm_model.load_state_dict(torch.load(lstm_path, map_location=device))
        self.lstm_model.eval()

    def predict(self, df):
        """Return one fused probability per row of ``df``, in row order.

        Args:
            df: Frame-level DataFrame containing FEATURE_COLUMNS plus
                ``match_id``, ``round`` and ``tick``.

        Returns:
            1-D numpy array of length ``len(df)``. Index 1 of
            ``predict_proba`` is used for XGBoost; rows without 10 frames of
            history inside their round carry the XGBoost probability alone.
        """
        # The LSTM results are written into a numpy array by position, so the
        # frame must carry a 0..n-1 index. Without this, a DataFrame that kept
        # its original index labels (e.g. a test split) would have predictions
        # written to the wrong rows or raise an IndexError.
        df = df.reset_index(drop=True)

        # 1. XGBoost predictions (one per frame)
        print("Generating XGBoost predictions...")
        probs_xgb = self.xgb_model.predict_proba(df[FEATURE_COLUMNS])[:, 1]

        # 2. LSTM predictions (one per complete window)
        print("Generating LSTM predictions...")
        probs_lstm = np.full(len(df), np.nan)  # NaN marks "no LSTM prediction"

        seq_len = 10
        inputs_list = []
        indices_list = []

        # Group by match and round so windows never span a round boundary.
        for _, group in df.groupby(['match_id', 'round']):
            group = group.sort_values('tick')
            data = group[FEATURE_COLUMNS].values

            if len(data) < seq_len:
                continue

            for i in range(len(data) - seq_len + 1):
                # The window ends at i + seq_len - 1; that row receives the prediction.
                inputs_list.append(data[i:i + seq_len])
                indices_list.append(group.index[i + seq_len - 1])

        if inputs_list:
            inputs_tensor = torch.FloatTensor(np.array(inputs_list)).to(self.device)
            with torch.no_grad():
                # Single full-batch forward pass.
                outputs = self.lstm_model(inputs_tensor)
                lstm_preds = outputs.cpu().numpy().flatten()
            probs_lstm[indices_list] = lstm_preds

        # 3. Fusion: weighted average where the LSTM spoke, XGBoost otherwise.
        print("Fusing predictions...")
        blended = ALPHA * probs_xgb + (1 - ALPHA) * probs_lstm
        return np.where(np.isnan(probs_lstm), probs_xgb, blended)
def main():
    """Score the weighted-average ensemble on the saved test set and print a report."""
    if not os.path.exists(TEST_DATA_PATH):
        print(f"Test set not found at {TEST_DATA_PATH}. Please run training first.")
        return

    print(f"Loading test set from {TEST_DATA_PATH}...")
    df_test = pd.read_parquet(TEST_DATA_PATH)

    # Ground truth: T -> 0, CT -> 1
    label_codes = {'T': 0, 'CT': 1}
    y_true = df_test['round_winner'].map(label_codes).values

    # Build the ensemble on GPU when one is available
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    ensemble = ClutchEnsemble(XGB_MODEL_PATH, LSTM_MODEL_PATH, device)

    # Predict and threshold at 0.5
    y_prob = ensemble.predict(df_test)
    y_pred = (y_prob > 0.5).astype(int)

    # Metrics
    acc = accuracy_score(y_true, y_pred)
    ll = log_loss(y_true, y_prob)

    heavy_rule = "=" * 50
    light_rule = "-" * 50
    print("\n" + heavy_rule)
    print(" ENSEMBLE MODEL RESULTS ")
    print(heavy_rule)
    print(f"🔥 Final Accuracy: {acc:.2%}")
    print(f"📉 Log Loss: {ll:.4f}")
    print(light_rule)
    print("Detailed Report:")
    print(classification_report(y_true, y_pred, target_names=['T', 'CT']))
    print(heavy_rule)

    # Reference point: the XGBoost model on its own, same test frame
    print("\n[Reference] Standalone XGBoost Performance:")
    xgb_only_prob = ensemble.xgb_model.predict_proba(df_test[FEATURE_COLUMNS])[:, 1]
    xgb_only_pred = (xgb_only_prob > 0.5).astype(int)
    print(f"XGB Accuracy: {accuracy_score(y_true, xgb_only_pred):.2%}")
if __name__ == "__main__":
main()

View File

@@ -1,217 +0,0 @@
"""
Stacking Ensemble Framework (Advanced Fusion)
=============================================
Beyond simple weighted averaging, this script implements 'Stacking' (Stacked Generalization).
It trains a Meta-Learner (Logistic Regression) to intelligently combine the predictions
of the base models (XGBoost + LSTM) based on the current context (e.g., game time).
Architecture:
1. Base Layer: XGBoost, LSTM
2. Meta Layer: Logistic Regression
Input: [Prob_XGB, Prob_LSTM, Game_Time, Team_Alive_Diff]
Output: Final Probability
Why this is better:
- It learns WHEN to trust which model (e.g., trust LSTM more in late-game).
- It can correct systematic biases of base models.
Usage:
python src/inference/stacking_ensemble.py
"""
import os
import sys
import numpy as np
import pandas as pd
import torch
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.model_selection import train_test_split
# Ensure imports work
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from src.training.models import ClutchAttentionLSTM
from src.features.definitions import FEATURE_COLUMNS
# Configuration
XGB_MODEL_PATH = "models/clutch_model_v1.json"
LSTM_MODEL_PATH = "models/clutch_attention_lstm_v1.pth"
TEST_DATA_PATH = "data/processed/test_set.parquet"
class StackingEnsemble:
    """Stacked generalization over an XGBoost model and an attention LSTM.

    A logistic-regression meta-learner is fed the two base probabilities
    together with ``game_time`` and ``alive_diff``.
    """

    def __init__(self, xgb_path, lstm_path, device='cpu'):
        """Create an unfitted meta-learner and load both base models from disk."""
        self.device = device
        self.meta_learner = LogisticRegression()
        self.is_fitted = False

        # Base model 1: XGBoost
        print("Loading Base Model: XGBoost...")
        self.xgb_model = xgb.XGBClassifier()
        self.xgb_model.load_model(xgb_path)

        # Base model 2: attention LSTM, switched to inference mode
        print("Loading Base Model: LSTM...")
        network = ClutchAttentionLSTM(
            input_dim=len(FEATURE_COLUMNS),
            hidden_dim=64,
            num_layers=2,
            dropout=0.5,
        )
        self.lstm_model = network.to(device)
        self.lstm_model.load_state_dict(torch.load(lstm_path, map_location=device))
        self.lstm_model.eval()

    def get_base_predictions(self, df):
        """Build the meta-learner inputs for every row of ``df``.

        Returns a DataFrame with columns ``prob_xgb``, ``prob_lstm``,
        ``game_time`` and ``alive_diff``, one row per input row.
        """
        # A fresh 0..n-1 index makes index labels usable as array positions.
        frame = df.reset_index(drop=True)

        # XGBoost: probability of class 1 for each frame
        xgb_probs = self.xgb_model.predict_proba(frame[FEATURE_COLUMNS])[:, 1]

        # LSTM: one prediction per complete 10-frame window within a round
        lstm_probs = np.full(len(frame), np.nan)
        window = 10
        windows = []
        target_rows = []
        for _key, rows in frame.groupby(['match_id', 'round']):
            rows = rows.sort_values('tick')
            values = rows[FEATURE_COLUMNS].values
            # `end` is the exclusive end of the window; rounds shorter than
            # `window` yield an empty range and are skipped.
            for end in range(window, len(values) + 1):
                windows.append(values[end - window:end])
                target_rows.append(rows.index[end - 1])

        if len(windows) > 0:
            batch = torch.FloatTensor(np.array(windows)).to(self.device)
            with torch.no_grad():
                scores = self.lstm_model(batch)
                lstm_probs[target_rows] = scores.cpu().numpy().flatten()

        # Rows without enough history inherit the XGBoost probability so the
        # meta-learner never sees NaN.
        no_history = np.isnan(lstm_probs)
        lstm_probs[no_history] = xgb_probs[no_history]

        # game_time and alive_diff give the meta-learner context for weighting.
        return pd.DataFrame({
            'prob_xgb': xgb_probs,
            'prob_lstm': lstm_probs,
            'game_time': frame['game_time'].values,
            'alive_diff': frame['alive_diff'].values
        })

    def fit_meta_learner(self, df, y):
        """Fit the logistic-regression meta-learner on ``df`` / ``y`` and print its weights."""
        print("Generating meta-features for training...")
        meta_inputs = self.get_base_predictions(df)

        print("Training Meta-Learner (Logistic Regression)...")
        self.meta_learner.fit(meta_inputs, y)
        self.is_fitted = True

        # Report the learned coefficient of each meta-feature, in column order.
        weights = self.meta_learner.coef_[0]
        print("\n[Meta-Learner Insights]")
        print("How much does it trust each signal?")
        signal_names = ("XGBoost", "LSTM", "GameTime", "AliveDiff")
        for signal, weight in zip(signal_names, weights[:4]):
            print(f" Weight on {signal}: {weight:.4f}")
        print("(Positive weight = positive correlation with CT winning)\n")

    def predict(self, df):
        """Return the meta-learner's class-1 probability for every row of ``df``.

        Raises:
            ValueError: If ``fit_meta_learner`` has not been called yet.
        """
        if self.is_fitted:
            meta_inputs = self.get_base_predictions(df)
            return self.meta_learner.predict_proba(meta_inputs)[:, 1]
        raise ValueError("Meta-learner not fitted! Call fit_meta_learner first.")
def main():
    """Fit the stacking meta-learner on half of the test set and evaluate on the rest."""
    if not os.path.exists(TEST_DATA_PATH):
        print("Test data not found.")
        return

    print(f"Loading data from {TEST_DATA_PATH}...")
    df = pd.read_parquet(TEST_DATA_PATH)

    # Labels are used as stored (already numeric 0/1).
    y = df['round_winner'].values

    # The meta-learner needs its own train/test partition. Prefer a split by
    # match so frames of one match never land on both sides.
    match_ids = df['match_id'].unique()
    if len(match_ids) < 2:
        print("Not enough matches for match-level split. Using random split (Caution: Leakage).")
        df_meta_train, df_meta_test, y_meta_train, y_meta_test = train_test_split(
            df, y, test_size=0.5, random_state=42
        )
    else:
        half = len(match_ids) // 2
        in_train = df['match_id'].isin(match_ids[:half])
        in_test = df['match_id'].isin(match_ids[half:])

        df_meta_train, y_meta_train = df[in_train], y[in_train]
        df_meta_test, y_meta_test = df[in_test], y[in_test]

        print(f"Split: Meta-Train ({len(df_meta_train)} rows) | Meta-Test ({len(df_meta_test)} rows)")

    # Build the ensemble on GPU when one is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    stacking_model = StackingEnsemble(XGB_MODEL_PATH, LSTM_MODEL_PATH, device)

    # 1. Train the meta-learner
    stacking_model.fit_meta_learner(df_meta_train, y_meta_train)

    # 2. Evaluate on the held-out half
    print("Evaluating Stacking Ensemble...")
    y_prob = stacking_model.predict(df_meta_test)
    y_pred = (y_prob > 0.5).astype(int)

    acc = accuracy_score(y_meta_test, y_pred)
    ll = log_loss(y_meta_test, y_prob)

    rule = "=" * 50
    print("\n" + rule)
    print(" STACKING ENSEMBLE RESULTS ")
    print(rule)
    print(f"Final Accuracy: {acc:.2%}")
    print(f"Log Loss: {ll:.4f}")
    print("-" * 50)
    print(classification_report(y_meta_test, y_pred, target_names=['T', 'CT']))

    # Baselines on the same held-out rows
    print(rule)
    print("[Baselines on Meta-Test Set]")

    # XGBoost alone
    xgb_labels = stacking_model.xgb_model.predict(df_meta_test[FEATURE_COLUMNS])
    acc_xgb = accuracy_score(y_meta_test, xgb_labels)
    print(f"XGBoost Only: {acc_xgb:.2%}")

    # LSTM alone (rows without history carry the XGBoost fallback value)
    lstm_column = stacking_model.get_base_predictions(df_meta_test)['prob_lstm'].values
    lstm_labels = (lstm_column > 0.5).astype(int)
    acc_lstm = accuracy_score(y_meta_test, lstm_labels)
    print(f"LSTM Only: {acc_lstm:.2%}")
    print(rule)
if __name__ == "__main__":
main()

View File

@@ -21,8 +21,8 @@ from src.features.definitions import FEATURE_COLUMNS
# Configuration # Configuration
MODEL_DIR = "models" MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, "clutch_model_v1.json") MODEL_PATH = os.path.join(MODEL_DIR, "clutch_model_v2.json")
TEST_DATA_PATH = os.path.join("data", "processed", "test_set.parquet") TEST_DATA_PATH = os.path.join("data", "processed", "test_set_v2.parquet")
# Configure logging # Configure logging
logging.basicConfig( logging.basicConfig(
@@ -49,7 +49,8 @@ def evaluate_model():
model.load_model(MODEL_PATH) model.load_model(MODEL_PATH)
# 2. Prepare Features # 2. Prepare Features
X_test = df_test[FEATURE_COLUMNS] feature_names = model.get_booster().feature_names or list(FEATURE_COLUMNS)
X_test = df_test[feature_names]
y_test = df_test['round_winner'].astype(int) y_test = df_test['round_winner'].astype(int)
# 3. Predict # 3. Predict

View File

@@ -1,111 +0,0 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
class Attention(nn.Module):
    """Learned softmax pooling over the sequence axis.

    A single linear layer scores every time step; the scores are normalised
    across the sequence and used to form a weighted sum of the inputs.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Pool ``x`` of shape (batch, seq, hidden).

        Returns:
            ``(context, weights)`` where ``context`` is (batch, hidden) and
            ``weights`` is (batch, seq, 1), summing to 1 along the seq axis.
        """
        # One raw score per time step, normalised over dim=1 (the sequence).
        weights = torch.softmax(self.attn(x), dim=1)
        # Broadcast the (batch, seq, 1) weights across hidden, then collapse seq.
        context = (x * weights).sum(dim=1)
        return context, weights
class ClutchLSTM(nn.Module):
    """Stacked LSTM followed by a linear layer and a sigmoid.

    Input is (batch, seq, feature); the output is the sigmoid of the linear
    projection of the last time step, shape (batch, output_dim).
    """

    def __init__(self, input_dim, hidden_dim=64, num_layers=2, output_dim=1, dropout=0.2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # batch_first=True: tensors are laid out as (batch, seq, feature).
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )
        # Projection head and output squashing for binary classification.
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run the sequence through the LSTM and score its final step."""
        # Explicit zero initial states, created on the same device as the input.
        state_shape = (self.num_layers, x.size(0), self.hidden_dim)
        h0 = torch.zeros(state_shape, device=x.device)
        c0 = torch.zeros(state_shape, device=x.device)

        # seq_out: (batch, seq, hidden_dim)
        seq_out, _ = self.lstm(x, (h0, c0))

        # Only the last time step feeds the classifier head.
        final_step = seq_out[:, -1, :]
        return self.sigmoid(self.fc(final_step))
class ClutchAttentionLSTM(nn.Module):
    """LayerNorm -> LSTM -> attention pooling -> two-layer MLP -> sigmoid.

    Input is (batch, seq, feature); output is (batch, output_dim) in (0, 1).
    """

    def __init__(self, input_dim, hidden_dim=128, num_layers=2, output_dim=1, dropout=0.3):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Normalise raw features before they enter the recurrent stack.
        self.layer_norm = nn.LayerNorm(input_dim)

        # Unidirectional, batch-first LSTM.
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=False,
        )

        # Collapses the sequence axis into a single context vector.
        self.attention = Attention(hidden_dim)

        # Classifier head.
        self.fc1 = nn.Linear(hidden_dim, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(64, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of sequences; attention weights are computed but not returned."""
        normed = self.layer_norm(x)

        # Explicit zero initial states on the input's device.
        state_shape = (self.num_layers, normed.size(0), self.hidden_dim)
        h0 = torch.zeros(state_shape, device=normed.device)
        c0 = torch.zeros(state_shape, device=normed.device)

        # lstm_out: (batch, seq, hidden)
        lstm_out, _ = self.lstm(normed, (h0, c0))

        # context: (batch, hidden)
        context, _attn_weights = self.attention(lstm_out)

        # MLP head: fc1 -> ReLU -> dropout -> fc2 -> sigmoid
        hidden = self.dropout(self.relu(self.fc1(context)))
        return self.sigmoid(self.fc2(hidden))

View File

@@ -1,114 +0,0 @@
"""
Sequence Data Preparation for LSTM/GRU Models
This script transforms the L2 frame-level data into L3 sequence data suitable for RNNs.
Output: (Batch_Size, Sequence_Length, Num_Features)
"""
import os
import sys
import numpy as np
import pandas as pd
import logging
import torch
from torch.utils.data import Dataset, DataLoader
# Ensure we can import from src
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from src.training.train import load_data, preprocess_features
from src.features.definitions import FEATURE_COLUMNS
# Configuration
SEQ_LEN = 10 # 10 frames * 2s/frame = 20 seconds of context
DATA_DIR = "data/processed"
OUTPUT_DIR = "data/sequences"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class ClutchSequenceDataset(Dataset):
    """Torch dataset pairing each feature sequence with its target value.

    Both inputs are converted to float32 tensors once, at construction.
    """

    def __init__(self, sequences, targets):
        self.sequences = torch.FloatTensor(sequences)
        self.targets = torch.FloatTensor(targets)

    def __len__(self):
        # Number of sequences == size of the leading dimension.
        return self.sequences.size(0)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        target = self.targets[idx]
        return sequence, target
def create_sequences(df, seq_len=10):
    """Cut sliding windows of ``seq_len`` frames out of every round.

    Rows are grouped by (match_id, round) and ordered by ``tick`` inside each
    group, so windows never cross a round boundary. Rounds with fewer than
    ``seq_len`` frames contribute nothing.

    Returns:
        Three numpy arrays of equal length: the windows of FEATURE_COLUMNS
        values, the ``round_winner`` of each window's last frame, and the
        ``match_id`` each window came from.
    """
    windows = []
    window_labels = []
    window_matches = []

    for (match_id, _round_num), frames in df.groupby(['match_id', 'round']):
        ordered = frames.sort_values('tick')
        feature_rows = ordered[FEATURE_COLUMNS].values
        winners = ordered['round_winner'].values

        # A non-positive count yields an empty range, which skips short rounds.
        window_count = len(feature_rows) - seq_len + 1
        for start in range(window_count):
            stop = start + seq_len
            windows.append(feature_rows[start:stop])
            # The window is labelled with the outcome stored on its last frame.
            window_labels.append(winners[stop - 1])
            window_matches.append(match_id)

    return np.array(windows), np.array(window_labels), np.array(window_matches)
def prepare_sequence_data(save=True):
    """Load raw data, build frame-level features and window them into sequences.

    Args:
        save: When true, write ``X_seq.npy``, ``y_seq.npy`` and
            ``matches_seq.npy`` into OUTPUT_DIR.

    Returns:
        The ``(X, y, matches)`` arrays produced by ``create_sequences``.
    """
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    logging.info("Loading raw data...")
    raw_df = load_data(DATA_DIR)

    logging.info("Preprocessing to frame-level features...")
    df = preprocess_features(raw_df)

    logging.info(f"Frame-level df shape: {df.shape}")
    logging.info(f"Columns: {df.columns.tolist()}")
    logging.info(f"Round Winner unique values: {df['round_winner'].unique()}")

    # String labels are mapped to numeric ones; numeric labels pass through.
    needs_mapping = df['round_winner'].dtype == 'object'
    if needs_mapping:
        logging.info("Mapping targets from T/CT to 0/1...")
        df['round_winner'] = df['round_winner'].map({'T': 0, 'CT': 1})

    logging.info(f"Round Winner unique values after mapping: {df['round_winner'].unique()}")

    # Rows whose label is missing (or failed to map) cannot be used as targets.
    df = df.dropna(subset=['round_winner'])
    logging.info(f"df shape after dropna: {df.shape}")

    logging.info(f"Creating sequences (Length={SEQ_LEN})...")
    X, y, matches = create_sequences(df, SEQ_LEN)

    logging.info(f"Generated {len(X)} sequences.")
    logging.info(f"Shape: {X.shape}")

    if save:
        logging.info(f"Saving to {OUTPUT_DIR}...")
        outputs = (("X_seq.npy", X), ("y_seq.npy", y), ("matches_seq.npy", matches))
        for filename, array in outputs:
            np.save(os.path.join(OUTPUT_DIR, filename), array)

    return X, y, matches
if __name__ == "__main__":
prepare_sequence_data()

View File

@@ -29,12 +29,12 @@ import sqlite3
sys.path.append(os.path.join(os.path.dirname(__file__), '..')) sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from features.spatial import calculate_spatial_features from features.spatial import calculate_spatial_features
from features.economy import calculate_economy_features from features.economy import calculate_economy_features
from features.definitions import FEATURE_COLUMNS from features.definitions import FEATURE_COLUMNS, XGB_FEATURE_COLUMNS_V2
# Configuration # Configuration
DATA_DIR = "data/processed" DATA_DIR = "data/processed"
MODEL_DIR = "models" MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, "clutch_model_v1.json") MODEL_PATH = os.path.join(MODEL_DIR, "clutch_model_v2.json")
L3_DB_PATH = os.path.join("database", "L3", "L3.db") L3_DB_PATH = os.path.join("database", "L3", "L3.db")
L2_DB_PATH = os.path.join("database", "L2", "L2.db") L2_DB_PATH = os.path.join("database", "L2", "L2.db")
TEST_SIZE = 0.2 TEST_SIZE = 0.2
@@ -102,6 +102,36 @@ def preprocess_features(df):
df['t_health'] = df['is_t'] * df['health'] df['t_health'] = df['is_t'] * df['health']
df['ct_health'] = df['is_ct'] * df['health'] df['ct_health'] = df['is_ct'] * df['health']
if 'flash_duration' not in df.columns:
df['flash_duration'] = 0.0
df['flash_duration'] = pd.to_numeric(df['flash_duration'], errors='coerce').fillna(0.0).astype('float32')
if 'has_defuser' not in df.columns:
df['has_defuser'] = 0
df['has_defuser'] = df['has_defuser'].fillna(0).astype(int)
if 'has_helmet' not in df.columns:
df['has_helmet'] = 0
df['has_helmet'] = df['has_helmet'].fillna(0).astype(int)
if 'armor_value' not in df.columns:
df['armor_value'] = 0
df['armor_value'] = pd.to_numeric(df['armor_value'], errors='coerce').fillna(0.0).astype('float32')
is_alive_int = df['is_alive'].astype(int)
is_blinded = ((df['flash_duration'] > 0).astype(int) * is_alive_int).astype(int)
df['t_blinded_count_p'] = df['is_t'] * is_blinded
df['ct_blinded_count_p'] = df['is_ct'] * is_blinded
df['t_blind_time_sum_p'] = df['is_t'] * is_alive_int * df['flash_duration']
df['ct_blind_time_sum_p'] = df['is_ct'] * is_alive_int * df['flash_duration']
df['ct_defuser_count_p'] = df['is_ct'] * is_alive_int * df['has_defuser']
df['t_helmet_count_p'] = df['is_t'] * is_alive_int * df['has_helmet']
df['ct_helmet_count_p'] = df['is_ct'] * is_alive_int * df['has_helmet']
df['t_armor_sum_p'] = df['is_t'] * is_alive_int * df['armor_value']
df['ct_armor_sum_p'] = df['is_ct'] * is_alive_int * df['armor_value']
# Aggregate per frame # Aggregate per frame
group_cols = ['match_id', 'map_name', 'round', 'tick', 'round_winner', 'is_bomb_planted', 'site'] group_cols = ['match_id', 'map_name', 'round', 'tick', 'round_winner', 'is_bomb_planted', 'site']
@@ -124,6 +154,36 @@ def preprocess_features(df):
# Note: 'round_winner' is in group_cols because it's constant per group # Note: 'round_winner' is in group_cols because it's constant per group
features_df = df.groupby(group_cols).agg(agg_funcs).reset_index() features_df = df.groupby(group_cols).agg(agg_funcs).reset_index()
utility_agg = (
df.groupby(['match_id', 'round', 'tick'])
.agg({
't_blinded_count_p': 'sum',
'ct_blinded_count_p': 'sum',
't_blind_time_sum_p': 'sum',
'ct_blind_time_sum_p': 'sum',
'ct_defuser_count_p': 'sum',
't_helmet_count_p': 'sum',
'ct_helmet_count_p': 'sum',
't_armor_sum_p': 'sum',
'ct_armor_sum_p': 'sum'
})
.reset_index()
.rename(columns={
't_blinded_count_p': 't_blinded_count',
'ct_blinded_count_p': 'ct_blinded_count',
't_blind_time_sum_p': 't_blind_time_sum',
'ct_blind_time_sum_p': 'ct_blind_time_sum',
'ct_defuser_count_p': 'ct_defuser_count',
't_helmet_count_p': 't_helmet_count',
'ct_helmet_count_p': 'ct_helmet_count',
't_armor_sum_p': 't_armor_sum',
'ct_armor_sum_p': 'ct_armor_sum'
})
)
utility_agg['ct_has_defuser'] = (utility_agg['ct_defuser_count'] > 0).astype(int)
utility_agg['blinded_diff'] = utility_agg['ct_blinded_count'] - utility_agg['t_blinded_count']
# 3. Add derived features # 3. Add derived features
features_df['health_diff'] = features_df['ct_health'] - features_df['t_health'] features_df['health_diff'] = features_df['ct_health'] - features_df['t_health']
features_df['alive_diff'] = features_df['ct_alive'] - features_df['t_alive'] features_df['alive_diff'] = features_df['ct_alive'] - features_df['t_alive']
@@ -140,6 +200,7 @@ def preprocess_features(df):
# Keys: match_id, round, tick # Keys: match_id, round, tick
features_df = pd.merge(features_df, spatial_features, on=['match_id', 'round', 'tick'], how='left') features_df = pd.merge(features_df, spatial_features, on=['match_id', 'round', 'tick'], how='left')
features_df = pd.merge(features_df, economy_features, on=['match_id', 'round', 'tick'], how='left') features_df = pd.merge(features_df, economy_features, on=['match_id', 'round', 'tick'], how='left')
features_df = pd.merge(features_df, utility_agg, on=['match_id', 'round', 'tick'], how='left')
rating_map = {} rating_map = {}
try: try:
@@ -237,7 +298,7 @@ def train_model(df):
"""Train XGBoost Classifier.""" """Train XGBoost Classifier."""
# Features (X) and Target (y) # Features (X) and Target (y)
feature_cols = FEATURE_COLUMNS feature_cols = XGB_FEATURE_COLUMNS_V2
target_col = 'round_winner' target_col = 'round_winner'
logging.info(f"Training features: {feature_cols}") logging.info(f"Training features: {feature_cols}")
@@ -288,7 +349,7 @@ def train_model(df):
model.fit(X_train, y_train) model.fit(X_train, y_train)
# Save Test Set for Evaluation Script # Save Test Set for Evaluation Script
test_set_path = os.path.join("data", "processed", "test_set.parquet") test_set_path = os.path.join("data", "processed", "test_set_v2.parquet")
logging.info(f"Saving validation set to {test_set_path}...") logging.info(f"Saving validation set to {test_set_path}...")
test_df.to_parquet(test_set_path) test_df.to_parquet(test_set_path)
@@ -309,6 +370,8 @@ def main():
os.makedirs(MODEL_DIR) os.makedirs(MODEL_DIR)
try: try:
model_path = os.getenv("CLUTCH_XGB_MODEL_PATH", MODEL_PATH)
# 1. Load # 1. Load
raw_df = load_data(DATA_DIR) raw_df = load_data(DATA_DIR)
@@ -323,8 +386,8 @@ def main():
model = train_model(features_df) model = train_model(features_df)
# 4. Save # 4. Save
model.save_model(MODEL_PATH) model.save_model(model_path)
logging.info(f"Model saved to {MODEL_PATH}") logging.info(f"Model saved to {model_path}")
# 5. Save player experience map for inference (optional) # 5. Save player experience map for inference (optional)
if 'steamid' in raw_df.columns: if 'steamid' in raw_df.columns:

View File

@@ -1,207 +0,0 @@
"""
Train Attention-LSTM Model for Clutch-IQ
"""
import os
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, classification_report
# Ensure we can import from src
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from src.training.models import ClutchAttentionLSTM
# Config
SEQ_DIR = os.path.join("data", "sequences")
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, "clutch_attention_lstm_v1.pth")
BATCH_SIZE = 32
EPOCHS = 50
LR = 0.001
PATIENCE = 10
class EarlyStopping:
    """Signal when validation loss has stopped improving.

    Call the instance once per epoch with the validation loss. After
    ``patience`` consecutive calls without an improvement, ``early_stop``
    becomes True.

    Attributes:
        patience: Number of consecutive non-improving calls tolerated.
        min_delta: Amount by which the loss must drop below the best value
            to count as an improvement.
        counter: Current run of non-improving calls.
        best_loss: Lowest loss accepted so far (None before the first call).
        early_stop: True once patience has been exhausted.
    """

    def __init__(self, patience=5, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        """Record one validation loss and update ``counter`` / ``early_stop``."""
        if self.best_loss is None:
            self.best_loss = val_loss
            return

        # Only a strict drop of more than min_delta is an improvement. A loss
        # equal to the best one (a flat plateau) or a NaN loss must not reset
        # the patience counter or replace best_loss; otherwise a stalled or
        # diverged run would never trigger early stopping.
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True
def train_lstm():
    """Train the attention-LSTM model and print held-out metrics.

    Steps:
        1. Load ``X_seq.npy`` / ``y_seq.npy`` (and ``matches_seq.npy`` when
           present) from ``SEQ_DIR``.
        2. Split 80/20. When match IDs are available the split is done per
           match so that no match contributes to both sides; otherwise a
           random (stratified when possible) split is used.
        3. Train for at most ``EPOCHS`` epochs with Adam, a plateau LR
           scheduler and early stopping, checkpointing the lowest validation
           loss to ``MODEL_PATH``.
        4. Reload the checkpoint written during this run and print a
           classification report on the held-out split.

    Returns:
        None. Prints a message and returns early when the input arrays are
        missing or when either split would be empty.
    """
    os.makedirs(MODEL_DIR, exist_ok=True)

    # 1. Load Data
    x_path = os.path.join(SEQ_DIR, "X_seq.npy")
    y_path = os.path.join(SEQ_DIR, "y_seq.npy")
    m_path = os.path.join(SEQ_DIR, "matches_seq.npy")

    if not os.path.exists(x_path) or not os.path.exists(y_path):
        print(f"Data not found at {SEQ_DIR}. Please run src/training/sequence_prep.py first.")
        return

    print("Loading sequence data...")
    X = np.load(x_path)
    y = np.load(y_path)

    # Load match IDs if available, else warn and use random split
    if os.path.exists(m_path):
        matches = np.load(m_path)
        print(f"Loaded match IDs. Shape: {matches.shape}")
    else:
        print("Warning: matches_seq.npy not found. Using random split (risk of leakage).")
        matches = None

    print(f"Data Shape: X={X.shape}, y={y.shape}")

    # 2. Split
    if matches is not None:
        # Group split: whole matches go to either train or test.
        unique_matches = np.unique(matches)
        print(f"Total unique matches: {len(unique_matches)}")

        # Shuffle matches (fixed seed keeps the split reproducible)
        np.random.seed(42)
        np.random.shuffle(unique_matches)

        n_train = int(len(unique_matches) * 0.8)
        train_match_ids = unique_matches[:n_train]
        test_match_ids = unique_matches[n_train:]
        print(f"Train matches: {len(train_match_ids)}, Test matches: {len(test_match_ids)}")

        train_mask = np.isin(matches, train_match_ids)
        test_mask = np.isin(matches, test_match_ids)

        X_train, X_test = X[train_mask], X[test_mask]
        y_train, y_test = y[train_mask], y[test_mask]
    else:
        # Stratify is important for imbalanced datasets. scikit-learn needs
        # at least two samples per class to stratify, so fall back to a plain
        # random split when that does not hold.
        _, class_counts = np.unique(y, return_counts=True)
        can_stratify = class_counts.size > 0 and class_counts.min() >= 2
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=0.2,
            random_state=42,
            stratify=y if can_stratify else None,
        )

    print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")

    # An empty split would otherwise surface later as a ZeroDivisionError
    # when the per-epoch losses are averaged.
    if len(X_train) == 0 or len(X_test) == 0:
        print("Not enough data to build non-empty train and test splits. Aborting.")
        return

    # 3. Convert to PyTorch Tensors
    train_data = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
    test_data = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())

    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

    # 4. Model Setup
    input_dim = X.shape[2]  # Number of features
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Training on {device}...")

    # Initialize Attention LSTM with higher dropout and lower complexity
    model = ClutchAttentionLSTM(input_dim=input_dim, hidden_dim=64, num_layers=2, dropout=0.5).to(device)

    criterion = nn.BCELoss()
    # Add weight decay for L2 regularization
    optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
    early_stopping = EarlyStopping(patience=PATIENCE, min_delta=0.0001)

    # 5. Train Loop
    best_loss = float('inf')
    # Tracks whether THIS run wrote a checkpoint, so the final evaluation
    # never silently loads a stale file left behind by an earlier run.
    checkpoint_saved = False

    print("-" * 50)
    print(f"{'Epoch':<6} | {'Train Loss':<12} | {'Val Loss':<12} | {'Val Acc':<10} | {'LR':<10}")
    print("-" * 50)

    for epoch in range(EPOCHS):
        model.train()
        train_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            labels = labels.unsqueeze(1)  # (batch, 1)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch average is per-sample.
            train_loss += loss.item() * inputs.size(0)

        train_loss /= len(train_loader.dataset)

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                labels = labels.unsqueeze(1)

                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)

                predicted = (outputs > 0.5).float()
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_loss /= len(test_loader.dataset)
        val_acc = correct / total

        current_lr = optimizer.param_groups[0]['lr']
        print(f"{epoch+1:<6} | {train_loss:.4f} | {val_loss:.4f} | {val_acc:.2%} | {current_lr:.1e}")

        # Scheduler & Checkpointing
        scheduler.step(val_loss)

        if val_loss < best_loss:
            best_loss = val_loss
            torch.save(model.state_dict(), MODEL_PATH)
            checkpoint_saved = True

        # Early Stopping
        early_stopping(val_loss)
        if early_stopping.early_stop:
            print("Early stopping triggered!")
            break

    print("Training Complete.")

    # 6. Final Evaluation
    if checkpoint_saved:
        print(f"Loading best model from {MODEL_PATH}...")
        model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    else:
        print("Warning: no checkpoint was written during this run; evaluating the current weights.")
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs.to(device))
            # reshape(-1) instead of squeeze(): squeeze() turns a final batch
            # of size 1 into a 0-d tensor, which cannot be iterated by extend().
            preds = (outputs.reshape(-1) > 0.5).long()
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.long().tolist())

    print("-" * 50)
    print("Detailed Classification Report (Best Model):")
    # Explicit labels keep the two target names valid even if the held-out
    # split happens to contain a single class.
    print(
        classification_report(
            all_labels,
            all_preds,
            labels=[0, 1],
            target_names=['T (Terrorist)', 'CT (Counter-Terrorist)'],
            zero_division=0,
        )
    )
    print("="*50)


if __name__ == "__main__":
    train_lstm()