Files
yrtv/ETL/verify/verify_deep.py

83 lines
2.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import sqlite3
import pandas as pd
import numpy as np
import sys
# Location of the SQLite database that the checks below inspect.
db_path = 'database/L2/L2_Main.sqlite'

# Configure pandas so that printed reports never elide rows, columns or long
# cell values, and so that floats are rendered with two decimal places.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.width = 2000
pd.options.display.float_format = '{:.2f}'.format
def _quote_identifier(name):
    """Return *name* as a double-quoted SQLite identifier.

    Embedded double quotes are doubled so that table names containing spaces,
    quotes or reserved words (e.g. ``order``) can be interpolated into SQL.
    """
    return '"' + str(name).replace('"', '""') + '"'


def _profile_column(col, series, total):
    """Build the summary row (nulls, zeros, cardinality, example) for one column.

    Args:
        col: Column name, reported in the ``Field`` entry.
        series: The column's values as a pandas Series.
        total: Number of rows in the table (must be non-zero).

    Returns:
        A dict with the keys ``Field``, ``Nulls``, ``Null%``, ``Zeros``,
        ``Zero%``, ``Unique`` and ``Example``.
    """
    # 1. Null check.
    nulls = series.isnull().sum()

    # 2. Zero check: values that cannot be converted to a number become NaN
    #    and therefore never compare equal to 0.
    try:
        numeric_series = pd.to_numeric(series, errors='coerce')
        zeros = (numeric_series == 0).sum()
    except (TypeError, ValueError, OverflowError):
        # Best effort: a column that cannot be coerced simply reports 0 zeros.
        zeros = 0

    # 3. Unique count (cardinality, nulls excluded).
    unique_count = series.nunique()

    # 4. Example value: the first non-null entry, if any.
    non_null = series.dropna()
    example = non_null.iloc[0] if not non_null.empty else 'ALL NULL'

    return {
        'Field': col,
        'Nulls': nulls,
        'Null%': (nulls / total) * 100,
        'Zeros': zeros,
        'Zero%': (zeros / total) * 100,
        'Unique': unique_count,
        'Example': str(example)[:50],  # truncate overly long examples
    }


def check_all_tables(database_path=None):
    """Print a per-column data-quality report for every table in the database.

    For each user table the report lists, per column, the number and
    percentage of NULLs, the number and percentage of zero values (after a
    best-effort numeric conversion), the count of distinct values and one
    example value. Rows are sorted by ``Zero%`` in descending order; columns
    with equal ``Zero%`` keep their original table order.

    Args:
        database_path: Path of the SQLite file to inspect. When ``None``
            (the default) the module-level ``db_path`` is used.

    Returns:
        None. The report is written to standard output.
    """
    if database_path is None:
        database_path = db_path

    conn = sqlite3.connect(database_path)
    try:
        # List all user tables. The underscore is escaped because a bare "_"
        # is a single-character wildcard in LIKE and would also hide tables
        # such as "sqlitex".
        tables = pd.read_sql(
            "SELECT name FROM sqlite_master "
            "WHERE type='table' AND name NOT LIKE 'sqlite\\_%' ESCAPE '\\'",
            conn,
        )['name'].tolist()

        for table in tables:
            print(f"\n{'='*20} Table: {table} {'='*20}")
            quoted = _quote_identifier(table)

            # Column names of the table, in declaration order.
            cols = pd.read_sql(f"PRAGMA table_info({quoted})", conn)['name'].tolist()

            # Load the whole table.
            df = pd.read_sql(f"SELECT * FROM {quoted}", conn)
            total = len(df)
            if total == 0:
                print("Table is empty (0 rows)")
                continue

            print(f"Total Rows: {total}")
            print("-" * 60)

            df_stats = pd.DataFrame(
                [_profile_column(col, df[col], total) for col in cols]
            )
            # Show the most zero-heavy fields first; mergesort is stable, so
            # ties stay in field order and the output is deterministic.
            df_stats = df_stats.sort_values('Zero%', ascending=False, kind='mergesort')
            print(df_stats.to_string(index=False))
            print("\n")
    finally:
        # Always release the connection, even if a query or print fails.
        conn.close()
# Run the full report only when this file is executed directly as a script.
if __name__ == "__main__":
    check_all_tables()