"""Profile every user table of an SQLite database.

For each table the script prints the row count followed by a per-column
summary: null count, zero count, distinct-value count and one example value.
"""
import sqlite3
import sys
from typing import Optional

import numpy as np
import pandas as pd

# Configure pandas so that no rows or columns are elided from printed output.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_colwidth', None)

db_path = 'database/L2/L2_Main.sqlite'


def _quote_identifier(name: str) -> str:
    """Return *name* as a double-quoted SQLite identifier."""
    # Embedded double quotes are escaped by doubling them, so table names
    # containing spaces, keywords or quotes can be interpolated safely.
    return '"' + name.replace('"', '""') + '"'


def _profile_column(name: str, series: pd.Series, total: int) -> dict:
    """Return the statistics row for one column of a non-empty table.

    Args:
        name: Column name, reported in the ``Field`` entry.
        series: The column's values.
        total: Number of rows in the table; must be non-zero because it is
            the denominator of the percentage fields.

    Returns:
        A dict with the keys ``Field``, ``Nulls``, ``Null%``, ``Zeros``,
        ``Zero%``, ``Unique`` and ``Example``.
    """
    nulls = series.isnull().sum()

    # Zero check: values that cannot be parsed as numbers are coerced to NaN
    # and therefore never compare equal to 0.
    try:
        numeric = pd.to_numeric(series, errors='coerce')
    except (TypeError, ValueError, OverflowError):
        # Best effort: a column that cannot be coerced at all has no zeros.
        zeros = 0
    else:
        zeros = (numeric == 0).sum()

    # Example value: the first non-null entry, if there is one.
    non_null = series.dropna()
    example = non_null.iloc[0] if not non_null.empty else 'ALL NULL'

    return {
        'Field': name,
        'Nulls': nulls,
        'Null%': (nulls / total) * 100,
        'Zeros': zeros,
        'Zero%': (zeros / total) * 100,
        'Unique': series.nunique(),
        'Example': str(example)[:50],  # truncate overly long examples
    }


def _profile_table(conn: sqlite3.Connection, table: str) -> None:
    """Print the row count and per-column statistics of *table*."""
    print(f"\n{'='*20} Table: {table} {'='*20}")

    quoted = _quote_identifier(table)
    cols = pd.read_sql(f"PRAGMA table_info({quoted})", conn)['name'].tolist()

    # The whole table is loaded into memory.
    df = pd.read_sql(f"SELECT * FROM {quoted}", conn)
    total = len(df)

    if total == 0:
        print("Table is empty (0 rows)")
        return

    print(f"Total Rows: {total}")
    print("-" * 60)

    df_stats = pd.DataFrame(
        [_profile_column(col, df[col], total) for col in cols]
    )
    # Sort by Zero% descending so the columns most dominated by zeros
    # surface first.
    df_stats = df_stats.sort_values('Zero%', ascending=False)

    print(df_stats.to_string(index=False))
    print("\n")


def check_all_tables(path: Optional[str] = None) -> None:
    """Print a data-quality profile of every user table in the database.

    Args:
        path: SQLite database file to inspect. Defaults to the module-level
            ``db_path``.
    """
    conn = sqlite3.connect(db_path if path is None else path)
    try:
        # "_" is a single-character wildcard in LIKE, so it is escaped here;
        # otherwise user tables such as "sqlitelog" would be skipped too.
        tables = pd.read_sql(
            "SELECT name FROM sqlite_master "
            "WHERE type='table' AND name NOT LIKE 'sqlite!_%' ESCAPE '!'",
            conn,
        )['name'].tolist()

        for table in tables:
            _profile_table(conn, table)
    finally:
        # Release the connection even if a query or computation fails.
        conn.close()


if __name__ == "__main__":
    check_all_tables()