feat: Implement L1A incremental refresh

2026-01-24 17:39:56 +08:00
parent 3a37755559
commit c787fef2d4
4 changed files with 174 additions and 3 deletions
--- a/ETL/L1A.py
+++ b/ETL/L1A.py
@@ -2,6 +2,7 @@ import os
 import json
 import sqlite3
 import glob
+import argparse  # Parses the --force command-line flag

 # Paths
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -26,17 +27,33 @@ def init_db():
    return conn

 def process_files():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--force', action='store_true', help='Force reprocessing of all files')
+    args = parser.parse_args()
+
    conn = init_db()
    cursor = conn.cursor()
    
+    # Load match_ids already stored in raw_iframe_network so their files are skipped
+    existing_ids = set()
+    if not args.force:
+        try:
+            cursor.execute("SELECT match_id FROM raw_iframe_network")
+            existing_ids = set(row[0] for row in cursor.fetchall())
+            print(f"Found {len(existing_ids)} existing matches in DB. Incremental mode active.")
+        except Exception as e:
+            print(f"Error checking existing data: {e}")
+
    # Pattern to match all iframe_network.json files
    # output_arena/*/iframe_network.json
    pattern = os.path.join(OUTPUT_ARENA_DIR, '*', 'iframe_network.json')
    files = glob.glob(pattern)
    
-    print(f"Found {len(files)} files to process.")
+    print(f"Found {len(files)} files in directory.")
    
    count = 0
+    skipped = 0
+    
    for file_path in files:
        try:
            # Extract match_id from directory name
@@ -44,6 +61,10 @@ def process_files():
            parent_dir = os.path.dirname(file_path)
            match_id = os.path.basename(parent_dir)
            
+            if match_id in existing_ids:
+                skipped += 1
+                continue
+
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            
@@ -63,7 +84,7 @@ def process_files():
            
    conn.commit()
    conn.close()
-    print(f"Finished processing {count} files.")
+    print(f"Finished. Processed: {count}, Skipped: {skipped}.")

 if __name__ == '__main__':
-    process_files()
+    process_files()