0.3:L1A finished.
This commit is contained in:
69
ETL/L1A.py
Normal file
69
ETL/L1A.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import os
|
||||
import json
|
||||
import sqlite3
|
||||
import glob
|
||||
|
||||
# Filesystem layout. Everything is anchored at BASE_DIR, the parent of the
# directory that contains this script, so the paths do not depend on the
# current working directory.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.dirname(_SCRIPT_DIR)

# Input: one sub-directory per match under output_arena/.
OUTPUT_ARENA_DIR = os.path.join(BASE_DIR, 'output_arena')

# Output: the L1A SQLite database and the directory that holds it.
DB_DIR = os.path.join(BASE_DIR, 'database', 'L1A')
DB_PATH = os.path.join(DB_DIR, 'L1A.sqlite')
|
||||
|
||||
def init_db():
    """Open the L1A SQLite database, creating its directory and table if needed.

    Ensures ``DB_DIR`` exists, connects to ``DB_PATH`` and creates the
    ``raw_iframe_network`` table when it is not already present.

    Returns:
        sqlite3.Connection: an open connection to ``DB_PATH``. The caller
        owns it and is responsible for closing it.

    Raises:
        OSError: if the database directory cannot be created.
        sqlite3.Error: if the database cannot be opened or the table cannot
            be created. The connection is closed before re-raising.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first (another process may create the directory
    # between the check and the makedirs call).
    os.makedirs(DB_DIR, exist_ok=True)

    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS raw_iframe_network (
                match_id TEXT PRIMARY KEY,
                content TEXT,
                processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        conn.commit()
    except sqlite3.Error:
        # Do not leak the handle when schema setup fails.
        conn.close()
        raise
    return conn
|
||||
|
||||
def process_files():
    """Load every ``iframe_network.json`` under ``OUTPUT_ARENA_DIR`` into SQLite.

    Each file matching ``output_arena/*/iframe_network.json`` is read as
    UTF-8 text and stored unparsed in the ``raw_iframe_network`` table, keyed
    by the name of the directory that contains it (used as ``match_id``).
    Existing rows with the same ``match_id`` are replaced.

    A failure on a single file is reported on stdout and the file is skipped;
    the remaining files are still processed. Progress is committed every 100
    successfully stored files and once more at the end.

    Raises:
        OSError, sqlite3.Error: propagated from ``init_db()`` or from the
            final commit. The connection is closed in every case.
    """
    conn = init_db()
    count = 0
    try:
        cursor = conn.cursor()

        # output_arena/<match directory>/iframe_network.json
        pattern = os.path.join(OUTPUT_ARENA_DIR, '*', 'iframe_network.json')
        # glob returns results in arbitrary order; sort so runs (and their
        # progress/error output) are reproducible.
        files = sorted(glob.glob(pattern))

        print(f"Found {len(files)} files to process.")

        for file_path in files:
            try:
                # file_path looks like .../output_arena/g161-xxx/iframe_network.json;
                # the containing directory name is the match id.
                match_id = os.path.basename(os.path.dirname(file_path))

                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Upsert: re-running the script refreshes existing rows.
                cursor.execute('''
                    INSERT OR REPLACE INTO raw_iframe_network (match_id, content)
                    VALUES (?, ?)
                ''', (match_id, content))

                count += 1
                if count % 100 == 0:
                    print(f"Processed {count} files...")
                    conn.commit()

            except Exception as e:
                # Deliberate best-effort boundary: one unreadable or
                # unstorable file must not abort the whole batch.
                print(f"Error processing {file_path}: {e}")

        conn.commit()
    finally:
        # Always release the database handle, even if globbing or the final
        # commit raises.
        conn.close()

    print(f"Finished processing {count} files.")
|
||||
|
||||
# Script entry point: run the ETL step only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    process_files()
|
||||
3
ETL/README.md
Normal file
3
ETL/README.md
Normal file
@@ -0,0 +1,3 @@
|
||||
L1A output_arena/*/iframe_network.json -> L1A.sqlite (Primary Key: match_id)
|
||||
L1B
|
||||
L2 L1A.sqlite -> L2.sqlite
|
||||
BIN
database/L1A/L1A.sqlite
Normal file
BIN
database/L1A/L1A.sqlite
Normal file
Binary file not shown.
@@ -1,4 +1,16 @@
|
||||
L1A 5eplay平台网页爬虫原始数据。
|
||||
ETL Step 1:
|
||||
|
||||
## ETL Step 1:
|
||||
从原始json数据库提取到L1A级数据库中。
|
||||
output_arena/*/iframe_network.json -> database/L1A/L1A.sqlite
|
||||
`output_arena/*/iframe_network.json` -> `database/L1A/L1A.sqlite`
|
||||
|
||||
### 脚本说明
|
||||
- **脚本位置**: `ETL/L1A.py`
|
||||
- **功能**: 自动遍历 `output_arena` 目录下所有的 `iframe_network.json` 文件,提取原始内容并以 `match_id` (文件夹名) 为主键存入 `L1A.sqlite` 数据库的 `raw_iframe_network` 表中。
|
||||
|
||||
### 运行方式
|
||||
使用项目指定的 Python 环境运行脚本:
|
||||
|
||||
```bash
|
||||
C:/ProgramData/anaconda3/python.exe ETL/L1A.py
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user