feat: Implement L1A incremental refresh
This commit is contained in:
27
ETL/L1A.py
27
ETL/L1A.py
@@ -2,6 +2,7 @@ import os
|
||||
import json
|
||||
import sqlite3
|
||||
import glob
|
||||
import argparse # Added
|
||||
|
||||
# Paths
|
||||
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
@@ -26,17 +27,33 @@ def init_db():
|
||||
return conn
|
||||
|
||||
def process_files():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--force', action='store_true', help='Force reprocessing of all files')
|
||||
args = parser.parse_args()
|
||||
|
||||
conn = init_db()
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Get existing match_ids to skip
|
||||
existing_ids = set()
|
||||
if not args.force:
|
||||
try:
|
||||
cursor.execute("SELECT match_id FROM raw_iframe_network")
|
||||
existing_ids = set(row[0] for row in cursor.fetchall())
|
||||
print(f"Found {len(existing_ids)} existing matches in DB. Incremental mode active.")
|
||||
except Exception as e:
|
||||
print(f"Error checking existing data: {e}")
|
||||
|
||||
# Pattern to match all iframe_network.json files
|
||||
# output_arena/*/iframe_network.json
|
||||
pattern = os.path.join(OUTPUT_ARENA_DIR, '*', 'iframe_network.json')
|
||||
files = glob.glob(pattern)
|
||||
|
||||
print(f"Found {len(files)} files to process.")
|
||||
print(f"Found {len(files)} files in directory.")
|
||||
|
||||
count = 0
|
||||
skipped = 0
|
||||
|
||||
for file_path in files:
|
||||
try:
|
||||
# Extract match_id from directory name
|
||||
@@ -44,6 +61,10 @@ def process_files():
|
||||
parent_dir = os.path.dirname(file_path)
|
||||
match_id = os.path.basename(parent_dir)
|
||||
|
||||
if match_id in existing_ids:
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
|
||||
@@ -63,7 +84,7 @@ def process_files():
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
print(f"Finished processing {count} files.")
|
||||
print(f"Finished. Processed: {count}, Skipped: {skipped}.")
|
||||
|
||||
if __name__ == '__main__':
|
||||
process_files()
|
||||
process_files()
|
||||
Reference in New Issue
Block a user