0.3:L1A finished.

This commit is contained in:
2026-01-23 18:41:14 +08:00
parent 781e6d4022
commit f7afb9cfd2
4 changed files with 86 additions and 2 deletions

69
ETL/L1A.py Normal file
View File

@@ -0,0 +1,69 @@
import os
import json
import sqlite3
import glob
# Paths
# Project root: two levels up from this file (this script lives in ETL/).
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Input: one subdirectory per match, each holding an iframe_network.json.
OUTPUT_ARENA_DIR = os.path.join(BASE_DIR, 'output_arena')
# Output: L1A-level SQLite database location.
DB_DIR = os.path.join(BASE_DIR, 'database', 'L1A')
DB_PATH = os.path.join(DB_DIR, 'L1A.sqlite')
def init_db():
    """Ensure the L1A database directory and schema exist.

    Returns:
        sqlite3.Connection: an open connection to DB_PATH with the
        raw_iframe_network table guaranteed to exist.
    """
    # exist_ok=True avoids the check-then-create race of the
    # os.path.exists() + os.makedirs() pattern.
    os.makedirs(DB_DIR, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    # match_id is the match folder name under output_arena;
    # content holds the raw JSON text verbatim (not parsed here).
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS raw_iframe_network (
            match_id TEXT PRIMARY KEY,
            content TEXT,
            processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    conn.commit()
    return conn
def process_files():
    """Load every output_arena/*/iframe_network.json into the L1A database.

    The parent directory name of each file is used as the match_id
    (primary key). Existing rows are replaced, so the script is safe
    to re-run after new matches are scraped.
    """
    conn = init_db()
    try:
        cursor = conn.cursor()
        # Expected layout: output_arena/<match_id>/iframe_network.json
        pattern = os.path.join(OUTPUT_ARENA_DIR, '*', 'iframe_network.json')
        files = glob.glob(pattern)
        print(f"Found {len(files)} files to process.")
        count = 0
        for file_path in files:
            try:
                # The match id is the file's parent directory name, e.g.
                # .../output_arena/g161-xxx/iframe_network.json -> g161-xxx
                match_id = os.path.basename(os.path.dirname(file_path))
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                # Upsert so a re-run refreshes rows instead of failing on
                # the PRIMARY KEY constraint.
                cursor.execute('''
                    INSERT OR REPLACE INTO raw_iframe_network (match_id, content)
                    VALUES (?, ?)
                ''', (match_id, content))
                count += 1
                if count % 100 == 0:
                    print(f"Processed {count} files...")
                    # Periodic commit keeps progress durable on long runs.
                    conn.commit()
            except (OSError, sqlite3.Error) as e:
                # Skip unreadable files / failed inserts but keep going;
                # narrowed from bare Exception so real bugs still surface.
                print(f"Error processing {file_path}: {e}")
        conn.commit()
        print(f"Finished processing {count} files.")
    finally:
        # Always release the connection, even if an error escapes the loop
        # (the original leaked it in that case).
        conn.close()
# Script entry point: run the full ETL pass when executed directly.
if __name__ == '__main__':
    process_files()

3
ETL/README.md Normal file
View File

@@ -0,0 +1,3 @@
L1A output_arena/iframe_network.json -> L1A.sqlite(Primary Key: match_id)
L1B
L2 L1A.sqlite -> L2.sqlite

BIN
database/L1A/L1A.sqlite Normal file

Binary file not shown.

View File

@@ -1,4 +1,16 @@
L1A 5eplay平台网页爬虫原始数据。
ETL Step 1:
## ETL Step 1:
从原始json数据库提取到L1A级数据库中。
`output_arena/*/iframe_network.json` -> `database/L1A/L1A.sqlite`
### 脚本说明
- **脚本位置**: `ETL/L1A.py`
- **功能**: 自动遍历 `output_arena` 目录下所有的 `iframe_network.json` 文件,提取原始内容并以 `match_id` (文件夹名) 为主键存入 `L1A.sqlite` 数据库的 `raw_iframe_network` 表中。
### 运行方式
使用项目指定的 Python 环境运行脚本:
```bash
C:/ProgramData/anaconda3/python.exe ETL/L1A.py
```