83 lines
2.8 KiB
Python
83 lines
2.8 KiB
Python
import sqlite3
|
||
import pandas as pd
|
||
import numpy as np
|
||
import sys
|
||
|
||
# Configure pandas so printed frames are never truncated: show every row and
# column, allow very wide lines, render floats with two decimals, and never
# shorten the contents of a cell.
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', 2000,
    'display.float_format', '{:.2f}'.format,
    'display.max_colwidth', None,
)

# Path of the SQLite database this script inspects.
db_path = 'database/L2/L2_Main.sqlite'
|
||
|
||
def _quote_identifier(name):
    """Return *name* as a double-quoted SQLite identifier.

    Embedded double quotes are doubled, so table names that are reserved
    words (e.g. ``order``) or contain spaces/punctuation can be interpolated
    into a statement without breaking it.
    """
    return '"' + str(name).replace('"', '""') + '"'


def _profile_column(series, total):
    """Build the statistics row for one column.

    Args:
        series: The column's values as a pandas Series.
        total: Number of rows in the table; must be greater than zero.

    Returns:
        A dict with the keys ``Nulls``, ``Null%``, ``Zeros``, ``Zero%``,
        ``Unique`` and ``Example`` (the ``Field`` key is added by the caller).
    """
    nulls = series.isnull().sum()

    # Zero check: coerce to numbers first so numeric text such as "0" is
    # counted too; values that cannot be converted become NaN and never
    # compare equal to 0.
    try:
        zeros = (pd.to_numeric(series, errors='coerce') == 0).sum()
    except (TypeError, ValueError):
        # Best effort: a column that cannot be coerced at all has no zeros.
        zeros = 0

    # Example value: the first non-null entry, if there is one.
    non_null = series.dropna()
    example = non_null.iloc[0] if len(non_null) > 0 else 'ALL NULL'

    return {
        'Nulls': nulls,
        'Null%': (nulls / total) * 100,
        'Zeros': zeros,
        'Zero%': (zeros / total) * 100,
        'Unique': series.nunique(),
        'Example': str(example)[:50],  # truncate overly long examples
    }


def check_all_tables(path=None):
    """Print a per-column data-quality report for every table in a database.

    For each user table the report shows the row count and, for every
    column, the number and percentage of NULLs, the number and percentage of
    zero values, the count of distinct values, and one example value. Rows
    of the report are sorted by ``Zero%`` in descending order.

    Args:
        path: SQLite database file to inspect. When omitted, the
            module-level ``db_path`` is used.

    Returns:
        None. The report is written to standard output.
    """
    # Look up the module-level default only when no explicit path is given.
    target = db_path if path is None else path
    conn = sqlite3.connect(target)
    try:
        # List user tables. The underscore is escaped because a bare "_" in
        # LIKE matches any single character, which would also hide user
        # tables such as "sqlite3cache".
        tables = pd.read_sql(
            "SELECT name FROM sqlite_master "
            "WHERE type='table' AND name NOT LIKE 'sqlite\\_%' ESCAPE '\\'",
            conn,
        )['name'].tolist()

        for table in tables:
            print(f"\n{'='*20} Table: {table} {'='*20}")

            quoted = _quote_identifier(table)

            # Column names in declaration order.
            cols = pd.read_sql(
                f"PRAGMA table_info({quoted})", conn
            )['name'].tolist()

            # Load the whole table; the statistics below need every row.
            df = pd.read_sql(f"SELECT * FROM {quoted}", conn)
            total = len(df)

            if total == 0:
                print(f"Table is empty (0 rows)")
                continue

            print(f"Total Rows: {total}")
            print("-" * 60)

            stats = [
                {'Field': col, **_profile_column(df[col], total)}
                for col in cols
            ]

            # Columns with the highest share of zeros come first, since
            # those are the ones this report is meant to surface.
            df_stats = pd.DataFrame(stats).sort_values(
                'Zero%', ascending=False
            )
            print(df_stats.to_string(index=False))
            print("\n")
    finally:
        # Release the connection even if a query above raised.
        conn.close()
|
||
|
||
# Script entry point: run the full table report only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    check_all_tables()
|