feat: Initial commit of Clutch-IQ project

This commit is contained in:
xunyulin230420
2026-02-05 23:26:03 +08:00
commit a355239861
66 changed files with 12922 additions and 0 deletions

View File

@@ -0,0 +1,261 @@
"""
L3 Feature Distribution Checker
Analyzes data quality issues:
- NaN/NULL values
- All values identical (no variance)
- Extreme outliers
- Zero-only columns
"""
import sqlite3
import sys
from pathlib import Path
from collections import defaultdict
import math
import os
# Set UTF-8 encoding for Windows
if sys.platform == 'win32':
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
# Add project root to path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
L3_DB_PATH = project_root / "database" / "L3" / "L3.db"
def get_column_stats(cursor, table_name):
    """Scan every numeric column of *table_name* and collect data-quality issues.

    Args:
        cursor: open sqlite3 cursor on the target database.
        table_name: table to inspect. NOTE(review): the table and column
            names are interpolated directly into SQL, so they must come from
            trusted code, never from user input.

    Returns:
        defaultdict(list) keyed by issue category ('HIGH_NULL', 'NO_VARIANCE',
        'ALL_ZEROS', 'NAN_VALUES', 'OUTLIERS'); each value is a list of dicts
        describing one offending column.

    Side effects:
        Prints a per-table banner and a one-line summary for every column
        that raised no issues.
    """
    # Get column names/types from the schema.
    # PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk).
    cursor.execute(f"PRAGMA table_info({table_name})")
    columns = cursor.fetchall()
    # Filter to numeric columns (skip steam_id_64, TEXT columns)
    numeric_cols = []
    for col in columns:
        col_name = col[1]
        col_type = col[2]
        if col_name != 'steam_id_64' and col_type in ('REAL', 'INTEGER'):
            numeric_cols.append(col_name)
    print(f"\n{'='*80}")
    print(f"Table: {table_name}")
    print(f"Analyzing {len(numeric_cols)} numeric columns...")
    print(f"{'='*80}\n")
    issues_found = defaultdict(list)
    for col in numeric_cols:
        # One aggregate pass of basic statistics per column.
        cursor.execute(f"""
            SELECT
                COUNT(*) as total_count,
                COUNT({col}) as non_null_count,
                MIN({col}) as min_val,
                MAX({col}) as max_val,
                AVG({col}) as avg_val,
                COUNT(DISTINCT {col}) as unique_count
            FROM {table_name}
        """)
        row = cursor.fetchone()
        total = row[0]
        non_null = row[1]
        min_val = row[2]
        max_val = row[3]
        avg_val = row[4]
        unique = row[5]
        null_count = total - non_null
        null_pct = (null_count / total * 100) if total > 0 else 0
        # Check for issues
        # Issue 1: more than half of the rows are NULL.
        if null_pct > 50:
            issues_found['HIGH_NULL'].append({
                'column': col,
                'null_pct': null_pct,
                'null_count': null_count,
                'total': total
            })
        # Issue 2: all non-NULL values identical (no variance).
        if non_null > 0 and unique == 1:
            issues_found['NO_VARIANCE'].append({
                'column': col,
                'value': min_val,  # with unique == 1, min == the single value
                'count': non_null
            })
        # Issue 3: column contains only zeros.
        if non_null > 0 and min_val == 0 and max_val == 0:
            issues_found['ALL_ZEROS'].append({
                'column': col,
                'count': non_null
            })
        # Issue 4: NaN values (in SQLite, NaN is stored as NULL or text 'nan'),
        # so this count overlaps with the NULL count checked in Issue 1.
        cursor.execute(f"""
            SELECT COUNT(*) FROM {table_name}
            WHERE CAST({col} AS TEXT) = 'nan' OR {col} IS NULL
        """)
        nan_count = cursor.fetchone()[0]
        if nan_count > non_null * 0.1:  # More than 10% NaN
            issues_found['NAN_VALUES'].append({
                'column': col,
                'nan_count': nan_count,
                'pct': (nan_count / total * 100)
            })
        # Issue 5: Extreme outliers (using IQR method)
        if non_null > 10 and unique > 2:  # Need enough data
            # Approximate Q1/Q3 by row rank over the non-NULL values.
            # NOTE(review): CAST truncates, so on very small row counts the
            # computed rank can be 0 and the subquery yields NULL — the
            # None-check below skips those cases.
            cursor.execute(f"""
                WITH ranked AS (
                    SELECT {col},
                           ROW_NUMBER() OVER (ORDER BY {col}) as rn,
                           COUNT(*) OVER () as total
                    FROM {table_name}
                    WHERE {col} IS NOT NULL
                )
                SELECT
                    (SELECT {col} FROM ranked WHERE rn = CAST(total * 0.25 AS INTEGER)) as q1,
                    (SELECT {col} FROM ranked WHERE rn = CAST(total * 0.75 AS INTEGER)) as q3
                FROM ranked
                LIMIT 1
            """)
            quartiles = cursor.fetchone()
            if quartiles and quartiles[0] is not None and quartiles[1] is not None:
                q1, q3 = quartiles
                iqr = q3 - q1
                if iqr > 0:
                    # Standard Tukey fences: anything outside [Q1-1.5*IQR, Q3+1.5*IQR].
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr
                    # NULLs satisfy neither comparison, so they are excluded here.
                    cursor.execute(f"""
                        SELECT COUNT(*) FROM {table_name}
                        WHERE {col} < ? OR {col} > ?
                    """, (lower_bound, upper_bound))
                    outlier_count = cursor.fetchone()[0]
                    outlier_pct = (outlier_count / non_null * 100) if non_null > 0 else 0
                    if outlier_pct > 5:  # More than 5% outliers
                        issues_found['OUTLIERS'].append({
                            'column': col,
                            'outlier_count': outlier_count,
                            'outlier_pct': outlier_pct,
                            'q1': q1,
                            'q3': q3,
                            'iqr': iqr
                        })
        # Print summary for columns with good data (i.e. not flagged above).
        if col not in [item['column'] for sublist in issues_found.values() for item in sublist]:
            if non_null > 0 and min_val is not None:
                print(f"{col:45s} | Min: {min_val:10.3f} | Max: {max_val:10.3f} | "
                      f"Avg: {avg_val:10.3f} | Unique: {unique:6d}")
    return issues_found
def print_issues(issues_found):
    """Print a human-readable report of collected data-quality issues.

    Emits one section per populated issue category; when every category is
    empty, a success banner is printed instead. Purely a reporting helper —
    nothing is returned and *issues_found* is not restructured.
    """
    bar = '=' * 80

    # Nothing collected anywhere: celebrate and bail out early.
    if not any(issues_found.values()):
        print(f"\n{bar}")
        print("✅ NO DATA QUALITY ISSUES FOUND!")
        print(f"{bar}\n")
        return

    print(f"\n{bar}")
    print("⚠️ DATA QUALITY ISSUES DETECTED")
    print(f"{bar}\n")

    # Columns dominated by NULLs.
    high_null = issues_found['HIGH_NULL']
    if high_null:
        print(f"❌ HIGH NULL PERCENTAGE ({len(high_null)} columns):")
        for entry in high_null:
            print(f" - {entry['column']:45s}: {entry['null_pct']:6.2f}% NULL "
                  f"({entry['null_count']}/{entry['total']})")
        print()

    # Columns whose every value is the same constant.
    constant = issues_found['NO_VARIANCE']
    if constant:
        print(f"❌ NO VARIANCE - All values identical ({len(constant)} columns):")
        for entry in constant:
            print(f" - {entry['column']:45s}: All {entry['count']} values = {entry['value']}")
        print()

    # Columns that contain only zeros.
    zeros = issues_found['ALL_ZEROS']
    if zeros:
        print(f"❌ ALL ZEROS ({len(zeros)} columns):")
        for entry in zeros:
            print(f" - {entry['column']:45s}: All {entry['count']} values are 0")
        print()

    # Columns with a high share of NaN/NULL entries.
    nans = issues_found['NAN_VALUES']
    if nans:
        print(f"❌ NAN/NULL VALUES ({len(nans)} columns):")
        for entry in nans:
            print(f" - {entry['column']:45s}: {entry['nan_count']} NaN/NULL ({entry['pct']:.2f}%)")
        print()

    # Columns with many values beyond the IQR fences.
    outliers = issues_found['OUTLIERS']
    if outliers:
        print(f"⚠️ EXTREME OUTLIERS ({len(outliers)} columns):")
        for entry in outliers:
            print(f" - {entry['column']:45s}: {entry['outlier_count']} outliers ({entry['outlier_pct']:.2f}%) "
                  f"[Q1={entry['q1']:.2f}, Q3={entry['q3']:.2f}, IQR={entry['iqr']:.2f}]")
        print()
def main():
    """Entry point: analyze dm_player_features in the L3 DB and report issues.

    Returns:
        0 on success, 1 if the database file is missing (value is used as
        the process exit status by the __main__ guard).
    """
    if not L3_DB_PATH.exists():
        print(f"❌ L3 database not found at: {L3_DB_PATH}")
        return 1
    print(f"\n{'='*80}")
    print("L3 Feature Distribution Checker")
    print(f"Database: {L3_DB_PATH}")
    print(f"{'='*80}")
    conn = sqlite3.connect(L3_DB_PATH)
    # try/finally guarantees the connection is released even if a query
    # below raises (the original leaked it on any exception).
    try:
        cursor = conn.cursor()
        # Get row count for context before the per-column analysis.
        cursor.execute("SELECT COUNT(*) FROM dm_player_features")
        total_players = cursor.fetchone()[0]
        print(f"\nTotal players: {total_players}")
        # Check dm_player_features table
        issues = get_column_stats(cursor, 'dm_player_features')
        print_issues(issues)
        # Summary statistics
        print(f"\n{'='*80}")
        print("SUMMARY")
        print(f"{'='*80}")
        print("Total Issues Found:")
        print(f" - High NULL percentage: {len(issues['HIGH_NULL'])}")
        print(f" - No variance (all same): {len(issues['NO_VARIANCE'])}")
        print(f" - All zeros: {len(issues['ALL_ZEROS'])}")
        print(f" - NaN/NULL values: {len(issues['NAN_VALUES'])}")
        print(f" - Extreme outliers: {len(issues['OUTLIERS'])}")
        print()
        return 0
    finally:
        conn.close()
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())