feat: Initial commit of Clutch-IQ project
This commit is contained in:
190
src/etl/auto_pipeline.py
Normal file
190
src/etl/auto_pipeline.py
Normal file
@@ -0,0 +1,190 @@
|
||||
"""
|
||||
Clutch-IQ Auto Pipeline
|
||||
-----------------------
|
||||
This script continuously monitors the `data/demos` directory for new .dem files.
|
||||
When a new file appears, it:
|
||||
1. Waits for the file to be fully written (size stability check).
|
||||
2. Moves it into the `data/demos` staging area (if it is not already there) and calls `src/etl/extract_snapshots.py` to process it.
|
||||
3. Deletes the source .dem file immediately after successful processing.
|
||||
|
||||
Usage:
|
||||
python src/etl/auto_pipeline.py
|
||||
|
||||
Stop:
|
||||
Press Ctrl+C to stop.
|
||||
"""
|
||||
|
||||
import argparse
import logging
import os
import shutil
import subprocess
import sys
import time
|
||||
|
||||
# --- Configuration ----------------------------------------------------------

# Directory watched when --watch-dir is not given on the command line.
# Resolved against the current working directory at import time.
DEFAULT_WATCH_DIR = os.path.abspath("data/demos")

# Directory passed to the extraction script as --output_dir.
OUTPUT_DIR = os.path.abspath("data/processed")

# Seconds to pause between scans of the watch directory.
CHECK_INTERVAL = 5
# Seconds between the two samples taken by the file-stability check.
STABILITY_WAIT = 2

# The extraction script sits in the same directory as this module.
EXTRACT_SCRIPT = os.path.join(os.path.dirname(__file__), "extract_snapshots.py")

# Log to stdout with a fixed [AutoPipeline] tag on every record.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - [AutoPipeline] - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
|
||||
|
||||
def is_file_stable(filepath, wait_seconds=2):
    """Return True when *filepath* looks fully written.

    The file is sampled twice, ``wait_seconds`` apart. It counts as stable
    only if it is non-empty and neither its size nor its modification time
    changed between the two samples. Comparing the modification time as well
    catches writers that pre-allocate the final size up front and fill the
    file in afterwards, where the size alone never changes.

    Args:
        filepath: Path of the file to check.
        wait_seconds: Seconds to sleep between the two samples.

    Returns:
        True if the file is non-empty and unchanged across the wait; False
        if it changed, is empty, or could not be stat'ed (e.g. it vanished).
    """
    try:
        before = os.stat(filepath)
        time.sleep(wait_seconds)
        after = os.stat(filepath)
    except OSError:
        # Missing or unreadable file: treat as "not ready" rather than fail.
        return False

    unchanged = (
        before.st_size == after.st_size
        and before.st_mtime_ns == after.st_mtime_ns
    )
    return unchanged and after.st_size > 0
|
||||
|
||||
def process_file(filepath):
    """Stage a demo file and run the extraction script on the staging area.

    The file is first moved into the ``data/demos`` staging directory
    (resolved against the current working directory) unless it already lives
    there. ``extract_snapshots.py`` is then run in a child process, which
    isolates its memory use and gives every run a clean state.

    NOTE: the extraction script is handed the staging *directory*, not this
    single file, together with ``--delete-source``. Moving the file into a
    dedicated staging area first keeps that run away from unrelated .dem
    files in the watched folder (e.g. a Downloads directory). Passing a
    single file path would be cleaner but needs support in
    extract_snapshots.py.

    Args:
        filepath: Path of the .dem file to process.

    Returns:
        None. Failures are logged, never raised.
    """
    logging.info(f"Processing new file: {filepath}")

    try:
        staging_dir = os.path.abspath("data/demos")
        os.makedirs(staging_dir, exist_ok=True)

        filename = os.path.basename(filepath)
        source_dir = os.path.abspath(os.path.dirname(filepath))

        # Files that already sit in the staging area are processed in place.
        if os.path.normcase(source_dir) != os.path.normcase(staging_dir):
            logging.info(f"Moving {filename} to staging area...")
            try:
                # shutil.move falls back to copy + delete when the watch
                # directory is on another filesystem or drive, where a plain
                # os.rename fails.
                shutil.move(filepath, os.path.join(staging_dir, filename))
            except OSError as e:
                logging.error(f"Failed to move file: {e}")
                return

        cmd = [
            sys.executable,
            EXTRACT_SCRIPT,
            "--demo_dir", staging_dir,
            "--output_dir", OUTPUT_DIR,
            "--delete-source",
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode == 0:
            logging.info("Successfully processed batch.")
            logging.info(result.stdout)
        else:
            logging.error(f"Processing failed with code {result.returncode}")
            logging.error(result.stderr)

    except Exception as e:
        # Boundary for a single file: log the problem and keep the monitor
        # running instead of letting one bad file take it down.
        logging.error(f"Execution error: {e}")
|
||||
|
||||
import threading
|
||||
|
||||
def monitor_loop(monitor_dir, stop_event=None):
    """Poll *monitor_dir* for .dem files and process each stable one.

    Runs until ``stop_event`` is set, or forever when it is None. Every pass
    lists the directory, orders the .dem files oldest first by
    ``os.path.getctime`` and hands each file that passes ``is_file_stable``
    to ``process_file``. Files that are still changing are skipped and
    picked up again on a later pass.

    Args:
        monitor_dir: Directory to watch; it is created if it is missing.
        stop_event: Optional ``threading.Event``. When set, the loop exits.
            It is also used for the pause between passes, so a stop request
            does not have to wait out the full interval.
    """
    logging.info(f"Monitoring {monitor_dir} for new .dem files...")
    logging.info("Files will be MOVED to staging, PROCESSED, and then DELETED.")

    def stop_requested():
        return stop_event is not None and stop_event.is_set()

    def pause():
        # Event.wait returns as soon as the event is set, unlike time.sleep.
        if stop_event is not None:
            stop_event.wait(CHECK_INTERVAL)
        else:
            time.sleep(CHECK_INTERVAL)

    while True:
        if stop_requested():
            logging.info("Stopping Auto Pipeline thread...")
            break

        try:
            os.makedirs(monitor_dir, exist_ok=True)
            names = [f for f in os.listdir(monitor_dir) if f.endswith('.dem')]
        except OSError as e:
            logging.error(f"Error accessing watch directory: {e}")
            pause()
            continue

        # Collect (ctime, name, path) for regular files only. A file can
        # vanish between the listing and the stat call; such entries are
        # dropped here instead of crashing the sort below.
        pending = []
        for name in names:
            path = os.path.join(monitor_dir, name)
            if not os.path.isfile(path):
                continue
            try:
                pending.append((os.path.getctime(path), name, path))
            except OSError:
                continue

        if pending:
            logging.info(f"Found {len(pending)} files pending in {monitor_dir}...")

            # Oldest first.
            pending.sort(key=lambda entry: entry[0])

            for _, name, filepath in pending:
                if stop_requested():
                    break

                # An earlier run in this pass may already have consumed it.
                if not os.path.exists(filepath):
                    continue

                if is_file_stable(filepath, STABILITY_WAIT):
                    process_file(filepath)
                else:
                    logging.info(f"File {name} is still being written... skipping.")

        pause()
|
||||
|
||||
def start_background_monitor(watch_dir=DEFAULT_WATCH_DIR, stop_event=None):
    """Start ``monitor_loop`` in a daemon thread and return the thread.

    Args:
        watch_dir: Directory to monitor for .dem files. It is converted to
            an absolute path, as ``main`` does for ``--watch-dir``.
        stop_event: Optional ``threading.Event`` forwarded to
            ``monitor_loop``; set it to ask the loop to exit. Without it the
            thread keeps running until the interpreter exits (it is a
            daemon thread).

    Returns:
        The started ``threading.Thread``.
    """
    monitor_thread = threading.Thread(
        target=monitor_loop,
        args=(os.path.abspath(watch_dir), stop_event),
        name="AutoPipelineMonitor",
        daemon=True,
    )
    monitor_thread.start()
    logging.info("Auto Pipeline service started in background.")
    return monitor_thread
|
||||
|
||||
def main():
    """Parse the command line and run the monitor in the foreground.

    The watch directory comes from ``--watch-dir`` (default:
    ``DEFAULT_WATCH_DIR``) and is created when it does not exist yet.
    Ctrl+C ends the loop with a log message instead of a traceback.
    """
    cli = argparse.ArgumentParser(description="Auto Pipeline Monitor")
    cli.add_argument(
        "--watch-dir",
        default=DEFAULT_WATCH_DIR,
        help="Directory to monitor for .dem files (e.g. C:/Users/Name/Downloads)",
    )
    options = cli.parse_args()
    monitor_dir = os.path.abspath(options.watch_dir)

    directory_missing = not os.path.exists(monitor_dir)
    if directory_missing:
        logging.warning(f"Watch directory {monitor_dir} does not exist. Creating it...")
        os.makedirs(monitor_dir)

    try:
        monitor_loop(monitor_dir)
    except KeyboardInterrupt:
        logging.info("Stopping Auto Pipeline...")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user