"""ETL step 1 (L1A): load raw crawler output into the L1A SQLite database.

L1A holds the raw web-crawler data collected from the 5eplay platform.

Pipeline overview (from the project READMEs):

    L1A  output_arena/*/iframe_network.json -> database/L1A/L1A.sqlite
         (table ``raw_iframe_network``, primary key ``match_id``)
    L1B  (not described yet)
    L2   L1A.sqlite -> L2.sqlite

This script (``ETL/L1A.py``) walks every ``iframe_network.json`` below
``output_arena`` and stores the file's raw text, keyed by ``match_id`` (the
name of the folder that contains the file), in ``raw_iframe_network``.

Run it with the project's Python environment, e.g.::

    C:/ProgramData/anaconda3/python.exe ETL/L1A.py
"""
import os
import json
import sqlite3
import glob

# Paths
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
OUTPUT_ARENA_DIR = os.path.join(BASE_DIR, 'output_arena')
DB_DIR = os.path.join(BASE_DIR, 'database', 'L1A')
DB_PATH = os.path.join(DB_DIR, 'L1A.sqlite')

# Commit (and report progress) after this many successfully stored files.
_COMMIT_EVERY = 100

_CREATE_TABLE_SQL = '''
    CREATE TABLE IF NOT EXISTS raw_iframe_network (
        match_id TEXT PRIMARY KEY,
        content TEXT,
        processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
'''

# INSERT OR REPLACE rewrites the whole row, so ``processed_at`` is reset to
# the current timestamp every time a match is re-imported.
_UPSERT_SQL = '''
    INSERT OR REPLACE INTO raw_iframe_network (match_id, content)
    VALUES (?, ?)
'''


def init_db(db_path=DB_PATH):
    """Open the L1A database, creating its directory and table if needed.

    Args:
        db_path: Location of the SQLite file. Defaults to ``DB_PATH``.

    Returns:
        An open ``sqlite3.Connection``; the caller is responsible for
        closing it.

    Raises:
        OSError: If the database directory cannot be created.
        sqlite3.Error: If the database cannot be opened or the table cannot
            be created.
    """
    db_dir = os.path.dirname(db_path)
    if db_dir:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(db_dir, exist_ok=True)

    conn = sqlite3.connect(db_path)
    try:
        conn.execute(_CREATE_TABLE_SQL)
        conn.commit()
    except sqlite3.Error:
        # Do not hand back (or leak) a connection to a half-initialised DB.
        conn.close()
        raise
    return conn


def process_files(arena_dir=OUTPUT_ARENA_DIR, db_path=DB_PATH):
    """Import every ``<arena_dir>/*/iframe_network.json`` into the database.

    Each file's raw text is upserted into ``raw_iframe_network`` under the
    name of its parent directory (the ``match_id``). A file that cannot be
    read or stored is reported and skipped; the remaining files are still
    processed.

    Args:
        arena_dir: Directory whose immediate sub-directories each hold one
            ``iframe_network.json``. Defaults to ``OUTPUT_ARENA_DIR``.
        db_path: Location of the SQLite file. Defaults to ``DB_PATH``.

    Returns:
        The number of files stored successfully.
    """
    # output_arena/*/iframe_network.json
    pattern = os.path.join(arena_dir, '*', 'iframe_network.json')
    # glob order is platform-dependent; sort for reproducible runs and logs.
    files = sorted(glob.glob(pattern))

    print(f"Found {len(files)} files to process.")

    count = 0
    conn = init_db(db_path)
    try:
        for file_path in files:
            # file_path is like .../output_arena/g161-xxx/iframe_network.json
            match_id = os.path.basename(os.path.dirname(file_path))
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                conn.execute(_UPSERT_SQL, (match_id, content))
            except (OSError, UnicodeDecodeError, sqlite3.Error) as e:
                # Best effort: one bad file must not abort the whole import.
                print(f"Error processing {file_path}: {e}")
                continue

            count += 1
            if count % _COMMIT_EVERY == 0:
                print(f"Processed {count} files...")
                conn.commit()

        conn.commit()
    finally:
        # Always release the database handle, even on an unexpected error.
        conn.close()

    print(f"Finished processing {count} files.")
    return count


if __name__ == '__main__':
    process_files()