""" L3 Feature Distribution Checker Analyzes data quality issues: - NaN/NULL values - All values identical (no variance) - Extreme outliers - Zero-only columns """ import sqlite3 import sys from pathlib import Path from collections import defaultdict import math import os # Set UTF-8 encoding for Windows if sys.platform == 'win32': import io sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') # Add project root to path project_root = Path(__file__).parent.parent.parent sys.path.insert(0, str(project_root)) L3_DB_PATH = project_root / "database" / "L3" / "L3.db" def get_column_stats(cursor, table_name): """Get statistics for all numeric columns in a table""" # Get column names cursor.execute(f"PRAGMA table_info({table_name})") columns = cursor.fetchall() # Filter to numeric columns (skip steam_id_64, TEXT columns) numeric_cols = [] for col in columns: col_name = col[1] col_type = col[2] if col_name != 'steam_id_64' and col_type in ('REAL', 'INTEGER'): numeric_cols.append(col_name) print(f"\n{'='*80}") print(f"Table: {table_name}") print(f"Analyzing {len(numeric_cols)} numeric columns...") print(f"{'='*80}\n") issues_found = defaultdict(list) for col in numeric_cols: # Get basic statistics cursor.execute(f""" SELECT COUNT(*) as total_count, COUNT({col}) as non_null_count, MIN({col}) as min_val, MAX({col}) as max_val, AVG({col}) as avg_val, COUNT(DISTINCT {col}) as unique_count FROM {table_name} """) row = cursor.fetchone() total = row[0] non_null = row[1] min_val = row[2] max_val = row[3] avg_val = row[4] unique = row[5] null_count = total - non_null null_pct = (null_count / total * 100) if total > 0 else 0 # Check for issues # Issue 1: High NULL percentage if null_pct > 50: issues_found['HIGH_NULL'].append({ 'column': col, 'null_pct': null_pct, 'null_count': null_count, 'total': total }) # Issue 2: All values identical (no variance) if non_null > 0 and unique == 1: issues_found['NO_VARIANCE'].append({ 'column': col, 'value': min_val, 'count': non_null }) # Issue 3: All zeros if non_null > 0 and min_val == 0 and max_val == 0: issues_found['ALL_ZEROS'].append({ 'column': col, 'count': non_null }) # Issue 4: NaN values (in SQLite, NaN is stored as NULL or text 'nan') cursor.execute(f""" SELECT COUNT(*) FROM {table_name} WHERE CAST({col} AS TEXT) = 'nan' OR {col} IS NULL """) nan_count = cursor.fetchone()[0] if nan_count > non_null * 0.1: # More than 10% NaN issues_found['NAN_VALUES'].append({ 'column': col, 'nan_count': nan_count, 'pct': (nan_count / total * 100) }) # Issue 5: Extreme outliers (using IQR method) if non_null > 10 and unique > 2: # Need enough data cursor.execute(f""" WITH ranked AS ( SELECT {col}, ROW_NUMBER() OVER (ORDER BY {col}) as rn, COUNT(*) OVER () as total FROM {table_name} WHERE {col} IS NOT NULL ) SELECT (SELECT {col} FROM ranked WHERE rn = CAST(total * 0.25 AS INTEGER)) as q1, (SELECT {col} FROM ranked WHERE rn = CAST(total * 0.75 AS INTEGER)) as q3 FROM ranked LIMIT 1 """) quartiles = cursor.fetchone() if quartiles and quartiles[0] is not None and quartiles[1] is not None: q1, q3 = quartiles iqr = q3 - q1 if iqr > 0: lower_bound = q1 - 1.5 * iqr upper_bound = q3 + 1.5 * iqr cursor.execute(f""" SELECT COUNT(*) FROM {table_name} WHERE {col} < ? OR {col} > ? """, (lower_bound, upper_bound)) outlier_count = cursor.fetchone()[0] outlier_pct = (outlier_count / non_null * 100) if non_null > 0 else 0 if outlier_pct > 5: # More than 5% outliers issues_found['OUTLIERS'].append({ 'column': col, 'outlier_count': outlier_count, 'outlier_pct': outlier_pct, 'q1': q1, 'q3': q3, 'iqr': iqr }) # Print summary for columns with good data if col not in [item['column'] for sublist in issues_found.values() for item in sublist]: if non_null > 0 and min_val is not None: print(f"✓ {col:45s} | Min: {min_val:10.3f} | Max: {max_val:10.3f} | " f"Avg: {avg_val:10.3f} | Unique: {unique:6d}") return issues_found def print_issues(issues_found): """Print detailed issue report""" if not any(issues_found.values()): print(f"\n{'='*80}") print("✅ NO DATA QUALITY ISSUES FOUND!") print(f"{'='*80}\n") return print(f"\n{'='*80}") print("⚠️ DATA QUALITY ISSUES DETECTED") print(f"{'='*80}\n") # HIGH NULL if issues_found['HIGH_NULL']: print(f"❌ HIGH NULL PERCENTAGE ({len(issues_found['HIGH_NULL'])} columns):") for issue in issues_found['HIGH_NULL']: print(f" - {issue['column']:45s}: {issue['null_pct']:6.2f}% NULL " f"({issue['null_count']}/{issue['total']})") print() # NO VARIANCE if issues_found['NO_VARIANCE']: print(f"❌ NO VARIANCE - All values identical ({len(issues_found['NO_VARIANCE'])} columns):") for issue in issues_found['NO_VARIANCE']: print(f" - {issue['column']:45s}: All {issue['count']} values = {issue['value']}") print() # ALL ZEROS if issues_found['ALL_ZEROS']: print(f"❌ ALL ZEROS ({len(issues_found['ALL_ZEROS'])} columns):") for issue in issues_found['ALL_ZEROS']: print(f" - {issue['column']:45s}: All {issue['count']} values are 0") print() # NAN VALUES if issues_found['NAN_VALUES']: print(f"❌ NAN/NULL VALUES ({len(issues_found['NAN_VALUES'])} columns):") for issue in issues_found['NAN_VALUES']: print(f" - {issue['column']:45s}: {issue['nan_count']} NaN/NULL ({issue['pct']:.2f}%)") print() # OUTLIERS if issues_found['OUTLIERS']: print(f"⚠️ EXTREME OUTLIERS ({len(issues_found['OUTLIERS'])} columns):") for issue in issues_found['OUTLIERS']: print(f" - {issue['column']:45s}: {issue['outlier_count']} outliers ({issue['outlier_pct']:.2f}%) " f"[Q1={issue['q1']:.2f}, Q3={issue['q3']:.2f}, IQR={issue['iqr']:.2f}]") print() def main(): """Main entry point""" if not L3_DB_PATH.exists(): print(f"❌ L3 database not found at: {L3_DB_PATH}") return 1 print(f"\n{'='*80}") print(f"L3 Feature Distribution Checker") print(f"Database: {L3_DB_PATH}") print(f"{'='*80}") conn = sqlite3.connect(L3_DB_PATH) cursor = conn.cursor() # Get row count cursor.execute("SELECT COUNT(*) FROM dm_player_features") total_players = cursor.fetchone()[0] print(f"\nTotal players: {total_players}") # Check dm_player_features table issues = get_column_stats(cursor, 'dm_player_features') print_issues(issues) # Summary statistics print(f"\n{'='*80}") print("SUMMARY") print(f"{'='*80}") print(f"Total Issues Found:") print(f" - High NULL percentage: {len(issues['HIGH_NULL'])}") print(f" - No variance (all same): {len(issues['NO_VARIANCE'])}") print(f" - All zeros: {len(issues['ALL_ZEROS'])}") print(f" - NaN/NULL values: {len(issues['NAN_VALUES'])}") print(f" - Extreme outliers: {len(issues['OUTLIERS'])}") print() conn.close() return 0 if __name__ == '__main__': sys.exit(main())