Files
yrtv/ETL/verify/verify_deep.py

83 lines
2.8 KiB
Python
Raw Normal View History

import sqlite3
import pandas as pd
import numpy as np
import sys
# Configure pandas so printed reports are never truncated: show every row and
# column, use a very wide console, render floats with two decimals, and do not
# shorten long cell values.
pd.set_option(
    "display.max_rows", None,
    "display.max_columns", None,
    "display.width", 2000,
    "display.float_format", "{:.2f}".format,
    "display.max_colwidth", None,
)

# Path of the SQLite database inspected by this script, relative to the
# current working directory.
db_path = "database/L2/L2_Main.sqlite"
def _quote_identifier(name):
    """Return *name* as a double-quoted SQLite identifier, doubling embedded quotes."""
    return '"' + str(name).replace('"', '""') + '"'


def _profile_column(col, series, total):
    """Build the statistics row for one column of a non-empty table.

    Args:
        col: Column name, reported in the ``Field`` entry.
        series: The column's values as a pandas Series.
        total: Number of rows in the table (must be non-zero).

    Returns:
        A dict with the keys ``Field``, ``Nulls``, ``Null%``, ``Zeros``,
        ``Zero%``, ``Unique`` and ``Example``.
    """
    # 1. Null check.
    nulls = series.isnull().sum()

    # 2. Zero check. Values that cannot be parsed as numbers become NaN and
    #    are therefore never counted as zero.
    try:
        numeric_series = pd.to_numeric(series, errors='coerce')
        zeros = (numeric_series == 0).sum()
    except Exception:
        # Best-effort diagnostic: a column that cannot be coerced at all is
        # reported with zero zeros. Unlike a bare ``except`` this still lets
        # KeyboardInterrupt / SystemExit propagate.
        zeros = 0

    # 3. Cardinality (distinct non-null values).
    unique_count = series.nunique()

    # 4. Example value: the first non-null entry, if there is one.
    non_null = series.dropna()
    example = non_null.iloc[0] if len(non_null) > 0 else 'ALL NULL'

    return {
        'Field': col,
        'Nulls': nulls,
        'Null%': (nulls / total) * 100,
        'Zeros': zeros,
        'Zero%': (zeros / total) * 100,
        'Unique': unique_count,
        'Example': str(example)[:50],  # truncate overly long examples
    }


def check_all_tables(database_path=None):
    """Print a per-column data-quality report for every table in a SQLite file.

    For each user table (names not matching ``sqlite_%``) the whole table is
    loaded into memory and, per column listed by ``PRAGMA table_info``, the
    report shows the null count and percentage, the count and percentage of
    values equal to zero after numeric coercion, the number of distinct
    values, and an example value. Rows of the report are sorted by ``Zero%``
    in descending order. Empty tables are reported as such and skipped.

    Args:
        database_path: Path of the SQLite database to inspect. When omitted
            (``None``), the module-level ``db_path`` is used.

    Returns:
        None. The report is written to standard output.

    Note:
        ``sqlite3.connect`` creates a new, empty database file when the path
        does not exist, in which case nothing is reported.
    """
    if database_path is None:
        database_path = db_path

    conn = sqlite3.connect(database_path)
    try:
        # Collect the names of all user tables.
        tables = pd.read_sql(
            "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'",
            conn,
        )['name'].tolist()

        for table in tables:
            print(f"\n{'='*20} Table: {table} {'='*20}")

            # Quote the identifier so names containing spaces, quotes or
            # reserved words do not break the statements below.
            quoted = _quote_identifier(table)

            # Column list of the table.
            cols_info = pd.read_sql(f"PRAGMA table_info({quoted})", conn)
            cols = cols_info['name'].tolist()

            # Load the full table.
            df = pd.read_sql(f"SELECT * FROM {quoted}", conn)
            total = len(df)
            if total == 0:
                print("Table is empty (0 rows)")
                continue

            print(f"Total Rows: {total}")
            print("-" * 60)

            stats = [_profile_column(col, df[col], total) for col in cols]

            # Print the complete statistics table. Sorting by Zero%
            # (descending) surfaces suspicious all-zero columns first.
            df_stats = pd.DataFrame(stats)
            df_stats = df_stats.sort_values('Zero%', ascending=False)
            print(df_stats.to_string(index=False))
            print("\n")
    finally:
        # Release the connection even when a query above raises.
        conn.close()
# Script entry point: profile every table only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    check_all_tables()