"""
L3 Feature Distribution Checker

Analyzes data quality issues:
- NaN/NULL values
- All values identical (no variance)
- Extreme outliers
- Zero-only columns
"""
|
|
|
|
import sqlite3
|
|
import sys
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
import math
|
|
import os
|
|
|
|
# Set UTF-8 encoding for Windows: the report prints ✓ / ❌ / ⚠️ characters,
# which the default Windows console codepage cannot encode; errors='replace'
# keeps printing safe even if the terminal still rejects a character.
if sys.platform == 'win32':
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')

# Add project root to path so project-local imports resolve when run as a script.
# Assumes this file sits three directory levels below the repo root — TODO confirm.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

# Location of the L3 feature database this checker inspects.
L3_DB_PATH = project_root / "database" / "L3" / "L3.db"
|
|
|
|
|
|
def _numeric_columns(cursor, table_name):
    """Return names of REAL/INTEGER columns, skipping the steam_id_64 key column."""
    cursor.execute(f"PRAGMA table_info({table_name})")
    return [row[1] for row in cursor.fetchall()
            if row[1] != 'steam_id_64' and row[2] in ('REAL', 'INTEGER')]


def _basic_stats(cursor, table_name, col):
    """Return (total, non_null, min, max, avg, distinct_count) for one column."""
    cursor.execute(f"""
        SELECT
            COUNT(*) as total_count,
            COUNT({col}) as non_null_count,
            MIN({col}) as min_val,
            MAX({col}) as max_val,
            AVG({col}) as avg_val,
            COUNT(DISTINCT {col}) as unique_count
        FROM {table_name}
    """)
    return cursor.fetchone()


def _count_nan_or_null(cursor, table_name, col):
    """Count rows where the column is NULL or stored as the text 'nan'.

    SQLite has no NaN literal: float NaN typically arrives as NULL, but some
    writers store the string 'nan', so both forms are checked.
    """
    cursor.execute(f"""
        SELECT COUNT(*) FROM {table_name}
        WHERE CAST({col} AS TEXT) = 'nan' OR {col} IS NULL
    """)
    return cursor.fetchone()[0]


def _iqr_outlier_report(cursor, table_name, col, non_null):
    """Apply the 1.5*IQR rule to one column.

    Returns a detail dict if more than 5% of non-NULL values fall outside
    [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; otherwise None (including when quartiles
    cannot be computed or IQR is zero).
    """
    # Approximate quartiles via row rank; CAST(... AS INTEGER) truncates, so
    # small tables can yield rank 0 -> NULL, handled below.
    cursor.execute(f"""
        WITH ranked AS (
            SELECT {col},
                   ROW_NUMBER() OVER (ORDER BY {col}) as rn,
                   COUNT(*) OVER () as total
            FROM {table_name}
            WHERE {col} IS NOT NULL
        )
        SELECT
            (SELECT {col} FROM ranked WHERE rn = CAST(total * 0.25 AS INTEGER)) as q1,
            (SELECT {col} FROM ranked WHERE rn = CAST(total * 0.75 AS INTEGER)) as q3
        FROM ranked
        LIMIT 1
    """)
    quartiles = cursor.fetchone()
    if not quartiles or quartiles[0] is None or quartiles[1] is None:
        return None

    q1, q3 = quartiles
    iqr = q3 - q1
    if iqr <= 0:
        return None

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # NULLs never satisfy either comparison, so only real values are counted.
    cursor.execute(f"""
        SELECT COUNT(*) FROM {table_name}
        WHERE {col} < ? OR {col} > ?
    """, (lower_bound, upper_bound))
    outlier_count = cursor.fetchone()[0]
    outlier_pct = (outlier_count / non_null * 100) if non_null > 0 else 0

    if outlier_pct <= 5:  # tolerate up to 5% outliers
        return None
    return {
        'column': col,
        'outlier_count': outlier_count,
        'outlier_pct': outlier_pct,
        'q1': q1,
        'q3': q3,
        'iqr': iqr
    }


def get_column_stats(cursor, table_name):
    """Scan every numeric column of *table_name* for data-quality problems.

    Checks each REAL/INTEGER column for: high NULL ratio (>50%), zero
    variance, all-zero values, NaN contamination (>10% of non-NULL count),
    and extreme outliers (1.5*IQR rule, >5% of values). Columns with no
    issues are printed as a one-line healthy summary.

    Args:
        cursor: Open sqlite3 cursor on the target database.
        table_name: Table to inspect. Trusted identifier — it is interpolated
            into SQL, so never pass untrusted input.

    Returns:
        defaultdict(list) mapping issue codes ('HIGH_NULL', 'NO_VARIANCE',
        'ALL_ZEROS', 'NAN_VALUES', 'OUTLIERS') to lists of detail dicts.
    """
    numeric_cols = _numeric_columns(cursor, table_name)

    print(f"\n{'='*80}")
    print(f"Table: {table_name}")
    print(f"Analyzing {len(numeric_cols)} numeric columns...")
    print(f"{'='*80}\n")

    issues_found = defaultdict(list)
    # Columns with at least one issue. Maintained incrementally instead of
    # re-scanning every issues_found entry on each loop iteration (the
    # previous approach was quadratic in the number of flagged columns).
    flagged = set()

    for col in numeric_cols:
        total, non_null, min_val, max_val, avg_val, unique = \
            _basic_stats(cursor, table_name, col)

        null_count = total - non_null
        null_pct = (null_count / total * 100) if total > 0 else 0

        # Issue 1: more than half the rows are NULL.
        if null_pct > 50:
            flagged.add(col)
            issues_found['HIGH_NULL'].append({
                'column': col,
                'null_pct': null_pct,
                'null_count': null_count,
                'total': total
            })

        # Issue 2: every non-NULL value is identical — feature carries no signal.
        if non_null > 0 and unique == 1:
            flagged.add(col)
            issues_found['NO_VARIANCE'].append({
                'column': col,
                'value': min_val,
                'count': non_null
            })

        # Issue 3: column is uniformly zero.
        if non_null > 0 and min_val == 0 and max_val == 0:
            flagged.add(col)
            issues_found['ALL_ZEROS'].append({
                'column': col,
                'count': non_null
            })

        # Issue 4: NaN/NULL contamination above 10% of the non-NULL count.
        nan_count = _count_nan_or_null(cursor, table_name, col)
        if nan_count > non_null * 0.1:
            flagged.add(col)
            issues_found['NAN_VALUES'].append({
                'column': col,
                'nan_count': nan_count,
                # Guarded division (previously unguarded, unlike null_pct).
                'pct': (nan_count / total * 100) if total > 0 else 0
            })

        # Issue 5: extreme outliers — needs enough data and spread to be meaningful.
        if non_null > 10 and unique > 2:
            report = _iqr_outlier_report(cursor, table_name, col, non_null)
            if report is not None:
                flagged.add(col)
                issues_found['OUTLIERS'].append(report)

        # Print summary for columns with good data.
        if col not in flagged and non_null > 0 and min_val is not None:
            print(f"✓ {col:45s} | Min: {min_val:10.3f} | Max: {max_val:10.3f} | "
                  f"Avg: {avg_val:10.3f} | Unique: {unique:6d}")

    return issues_found
|
|
|
|
|
|
def print_issues(issues_found):
    """Render a human-readable report of the issues from get_column_stats.

    Prints a success banner when no issue list is non-empty; otherwise
    prints one section per populated issue category.
    """
    if not any(issues_found.values()):
        print(f"\n{'='*80}")
        print("✅ NO DATA QUALITY ISSUES FOUND!")
        print(f"{'='*80}\n")
        return

    print(f"\n{'='*80}")
    print("⚠️ DATA QUALITY ISSUES DETECTED")
    print(f"{'='*80}\n")

    # (category key, header template, per-entry line formatter) — iterated in
    # the same fixed order the report has always used.
    sections = (
        ('HIGH_NULL',
         "❌ HIGH NULL PERCENTAGE ({n} columns):",
         lambda e: f"  - {e['column']:45s}: {e['null_pct']:6.2f}% NULL "
                   f"({e['null_count']}/{e['total']})"),
        ('NO_VARIANCE',
         "❌ NO VARIANCE - All values identical ({n} columns):",
         lambda e: f"  - {e['column']:45s}: All {e['count']} values = {e['value']}"),
        ('ALL_ZEROS',
         "❌ ALL ZEROS ({n} columns):",
         lambda e: f"  - {e['column']:45s}: All {e['count']} values are 0"),
        ('NAN_VALUES',
         "❌ NAN/NULL VALUES ({n} columns):",
         lambda e: f"  - {e['column']:45s}: {e['nan_count']} NaN/NULL ({e['pct']:.2f}%)"),
        ('OUTLIERS',
         "⚠️ EXTREME OUTLIERS ({n} columns):",
         lambda e: f"  - {e['column']:45s}: {e['outlier_count']} outliers ({e['outlier_pct']:.2f}%) "
                   f"[Q1={e['q1']:.2f}, Q3={e['q3']:.2f}, IQR={e['iqr']:.2f}]"),
    )

    for key, header, format_entry in sections:
        entries = issues_found[key]
        if not entries:
            continue
        print(header.format(n=len(entries)))
        for entry in entries:
            print(format_entry(entry))
        print()
|
|
|
|
|
|
def main():
    """Run the distribution checker against the L3 database.

    Validates that the database file exists, analyzes the
    dm_player_features table, prints the issue report and a summary.

    Returns:
        0 on success, 1 when the L3 database file is missing (the value is
        used as the process exit code by the __main__ guard).
    """
    if not L3_DB_PATH.exists():
        print(f"❌ L3 database not found at: {L3_DB_PATH}")
        return 1

    print(f"\n{'='*80}")
    print(f"L3 Feature Distribution Checker")
    print(f"Database: {L3_DB_PATH}")
    print(f"{'='*80}")

    conn = sqlite3.connect(L3_DB_PATH)
    try:
        cursor = conn.cursor()

        # Get row count
        cursor.execute("SELECT COUNT(*) FROM dm_player_features")
        total_players = cursor.fetchone()[0]
        print(f"\nTotal players: {total_players}")

        # Check dm_player_features table
        issues = get_column_stats(cursor, 'dm_player_features')
        print_issues(issues)

        # Summary statistics
        print(f"\n{'='*80}")
        print("SUMMARY")
        print(f"{'='*80}")
        print(f"Total Issues Found:")
        print(f"  - High NULL percentage: {len(issues['HIGH_NULL'])}")
        print(f"  - No variance (all same): {len(issues['NO_VARIANCE'])}")
        print(f"  - All zeros: {len(issues['ALL_ZEROS'])}")
        print(f"  - NaN/NULL values: {len(issues['NAN_VALUES'])}")
        print(f"  - Extreme outliers: {len(issues['OUTLIERS'])}")
        print()
    finally:
        # BUG FIX: the connection was previously leaked if any query above
        # raised (e.g. the dm_player_features table missing).
        conn.close()

    return 0
|
|
|
|
|
|
# Run as a script: propagate main()'s return value as the process exit code.
if __name__ == '__main__':
    sys.exit(main())
|