From 4fca32a508824db9117b21251edf8c5dbb3a9ce3 Mon Sep 17 00:00:00 2001
From: AI Christianson
Date: Tue, 31 Dec 2024 08:36:19 -0500
Subject: [PATCH] SWEBench updates.

---
 scripts/generate_swebench_dataset.py | 79 +++++++++++++++++++---------
 1 file changed, 54 insertions(+), 25 deletions(-)

diff --git a/scripts/generate_swebench_dataset.py b/scripts/generate_swebench_dataset.py
index 56ab201..70a7be3 100755
--- a/scripts/generate_swebench_dataset.py
+++ b/scripts/generate_swebench_dataset.py
@@ -10,6 +10,8 @@ It:
 - Installs `ra-aid` in editable mode + any project dependencies via `uv pip`
 - Calls `uv run ra-aid` to generate a patch
 - Writes out predictions in JSON format
+
+No progress bar or spinner is used, allowing `ra-aid` output to stream directly.
 """
 
 import argparse
@@ -24,13 +26,13 @@ from typing import Optional, Tuple, Dict, Any, List
 
 from git import Repo
 from rich.logging import RichHandler
-from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
 
 # If you'd like to override Python versions for specific repos:
 PYTHON_VERSION_OVERRIDES = {
     # "someorg/somerepo": "3.9",
 }
 
+
 def setup_logging(log_dir: Path, verbose: bool = False) -> None:
     """Configure logging with both file and console handlers."""
     log_dir.mkdir(parents=True, exist_ok=True)
@@ -55,6 +57,7 @@ def setup_logging(log_dir: Path, verbose: bool = False) -> None:
     console_handler.setLevel(logging.DEBUG if verbose else logging.INFO)
     root_logger.addHandler(console_handler)
 
+
 def load_dataset_safely() -> Optional[Any]:
     """Load SWE-bench Lite dataset with error handling."""
     try:
@@ -65,6 +68,7 @@ def load_dataset_safely() -> Optional[Any]:
         logging.error(f"Failed to load dataset: {e}")
         return None
 
+
 def create_output_dirs() -> Tuple[Path, Path]:
     """Create base/log directory structure."""
     date_str = datetime.now().strftime("%Y%m%d")
@@ -74,6 +78,7 @@ def create_output_dirs() -> Tuple[Path, Path]:
     log_dir.mkdir(parents=True, exist_ok=True)
     return base_dir, log_dir
 
+
 def uv_venv(repo_dir: Path, repo_name: str, force_venv: bool) -> None:
     """
     Create (or reuse) a .venv in 'repo_dir' using 'uv venv'.
@@ -87,7 +92,7 @@ def uv_venv(repo_dir: Path, repo_name: str, force_venv: bool) -> None:
         logging.info(f"Removing existing .venv at {venv_dir}")
         shutil.rmtree(venv_dir)
 
-    python_version = PYTHON_VERSION_OVERRIDES.get(repo_name, None)
+    python_version = PYTHON_VERSION_OVERRIDES.get(repo_name, None) or "3.12"
     cmd = ["uv", "venv"]
     if python_version:
         cmd.append(f"--python={python_version}")
@@ -98,6 +103,7 @@ def uv_venv(repo_dir: Path, repo_name: str, force_venv: bool) -> None:
     except Exception as e:
         logging.error(f"Failed to create venv in {repo_dir}: {e}")
 
+
 def uv_pip_install(repo_dir: Path, args: List[str]) -> None:
     """
     Run 'uv pip install ...' in the specified repo_dir.
@@ -109,9 +115,11 @@ def uv_pip_install(repo_dir: Path, args: List[str]) -> None:
     except Exception as e:
         logging.error(f"Failed to run uv pip install {args}: {e}")
 
+
 def uv_run_raaid(repo_dir: Path, prompt: str) -> Optional[str]:
     """
-    Call 'uv run ra-aid' with the given prompt in the environment.
+    Call 'uv run ra-aid' with the given prompt in the environment,
+    streaming output directly to the console (capture_output=False).
     Returns the patch if successful, else None.
""" cmd = [ @@ -119,12 +127,16 @@ def uv_run_raaid(repo_dir: Path, prompt: str) -> Optional[str]: "--cowboy-mode", "-m", prompt ] + # We are NOT capturing output, so it streams live: try: - result = subprocess.run(cmd, cwd=repo_dir, text=True, capture_output=True, timeout=300) + result = subprocess.run( + cmd, + cwd=repo_dir, + text=True, + check=False, # We manually handle exit code + ) if result.returncode != 0: logging.error("ra-aid returned non-zero exit code.") - logging.debug(f"stdout: {result.stdout}") - logging.debug(f"stderr: {result.stderr}") return None except subprocess.TimeoutExpired: logging.error("ra-aid timed out") @@ -137,6 +149,7 @@ def uv_run_raaid(repo_dir: Path, prompt: str) -> Optional[str]: patch = get_git_patch(repo_dir) return patch + def get_git_patch(repo_dir: Path) -> Optional[str]: """Generate a git patch from the current changes in `repo_dir`.""" try: @@ -154,6 +167,7 @@ def get_git_patch(repo_dir: Path) -> Optional[str]: logging.error(f"Failed to generate patch: {e}") return None + def setup_venv_and_deps(repo_dir: Path, repo_name: str, force_venv: bool) -> None: """ - uv venv .venv --python=xxx (optional) @@ -189,6 +203,7 @@ def setup_venv_and_deps(repo_dir: Path, repo_name: str, force_venv: bool) -> Non if req_dev_file.is_file(): uv_pip_install(repo_dir, ["-r", "requirements-dev.txt"]) + def build_prompt(problem_statement: str, fail_tests: List[str], pass_tests: List[str]) -> str: """ Construct the prompt text from problem_statement, FAIL_TO_PASS, PASS_TO_PASS. @@ -202,14 +217,25 @@ def build_prompt(problem_statement: str, fail_tests: List[str], pass_tests: List for t in pass_tests: prompt += f"- {t}\n" prompt += "```\n\n" + prompt += "\n\nYou must run all relevant tests both before and after making changes, and ensure they pass as you do your work." return prompt + def process_instance( instance: Dict[str, Any], projects_dir: Path, reuse_repo: bool, force_venv: bool ) -> Dict[str, Any]: + """ + Process a single dataset instance without a progress bar/spinner. + - Clone or reuse the repo at projects_dir/ + - Checkout commit + - Create or reuse a .venv in that repo + - Install ra-aid + any project dependencies + - Build prompt, run ra-aid (output streamed to console) + - Return prediction dict + """ inst_id = instance.get("instance_id", "") repo_name = instance["repo"] commit = instance["base_commit"] @@ -222,7 +248,6 @@ def process_instance( if isinstance(pass_tests, str): pass_tests = [pass_tests] - # Build GH URL if "github.com" not in repo_name: repo_url = f"https://github.com/{repo_name}.git" else: @@ -230,13 +255,11 @@ def process_instance( checkout_dir = projects_dir / f"{inst_id}" - # Clone or reuse try: if not checkout_dir.exists(): logging.info(f"Cloning {repo_url} -> {checkout_dir}") repo = Repo.clone_from(repo_url, checkout_dir) else: - # if reuse_repo if reuse_repo: logging.info(f"Reusing existing directory: {checkout_dir}") repo = Repo(checkout_dir) @@ -245,7 +268,7 @@ def process_instance( shutil.rmtree(checkout_dir) repo = Repo.clone_from(repo_url, checkout_dir) - # checkout commit + # checkout correct commit repo.git.checkout(commit) # set up venv + deps @@ -269,8 +292,11 @@ def process_instance( "model_name_or_path": "ra-aid" } + def main() -> None: - parser = argparse.ArgumentParser(description="Generate predictions for SWE-bench Lite using uv + ra-aid.") + parser = argparse.ArgumentParser( + description="Generate predictions for SWE-bench Lite using uv + ra-aid (no progress bar)." 
+    )
     parser.add_argument(
         "output_dir",
         type=Path,
@@ -307,43 +333,46 @@ def main() -> None:
 
     from datasets import load_dataset
 
+    # Create base/log dirs and set up logging
     base_dir, log_dir = create_output_dirs()
     setup_logging(log_dir, args.verbose)
     logging.info("Starting script")
 
+    # Ensure projects dir
     args.projects_dir.mkdir(parents=True, exist_ok=True)
 
+    # Load dataset
     dataset = load_dataset_safely()
     if dataset is None:
         sys.exit(1)
 
+    # Combine dev + test
    all_data = list(dataset["dev"]) + list(dataset["test"])
 
+    # Ensure output dir
     args.output_dir.mkdir(parents=True, exist_ok=True)
 
     predictions_file = args.output_dir / "predictions.json"
 
     predictions: List[Dict[str, str]] = []
     limit = args.num_instances if args.num_instances else len(all_data)
 
-    with Progress(
-        SpinnerColumn(),
-        TextColumn("[progress.description]{task.description}"),
-        TimeElapsedColumn(),
-        transient=False
-    ) as progress:
-        task = progress.add_task("Processing instances...", total=limit)
-        for i, inst in enumerate(all_data):
-            if i >= limit:
-                break
-            pred = process_instance(inst, args.projects_dir, args.reuse_repo, args.force_venv)
-            predictions.append(pred)
-            progress.advance(task)
+    # Just a simple for loop - no progress bar
+    logging.info(f"Processing up to {limit} instances.")
+    for i, inst in enumerate(all_data):
+        if i >= limit:
+            break
+        logging.info(f"=== Instance {i+1}/{limit}, ID={inst.get('instance_id')} ===")
+        pred = process_instance(inst, args.projects_dir, args.reuse_repo, args.force_venv)
+        predictions.append(pred)
+
+    # Save predictions
     with open(predictions_file, "w", encoding="utf-8") as f:
         json.dump(predictions, f, indent=2)
 
     logging.info("Done generating predictions.")
 
+
 if __name__ == "__main__":
     try:
         main()
@@ -352,4 +381,4 @@ if __name__ == "__main__":
         sys.exit(1)
     except Exception as e:
         logging.exception("Unhandled error occurred.")
-        sys.exit(1)
+        sys.exit(1)
\ No newline at end of file
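
Reviewer note (illustrative, not part of the patch): the uv_run_raaid change relies on subprocess.run's default behaviour of leaving the child's stdout/stderr attached to the parent process, which is what lets ra-aid output stream live. A minimal sketch of the difference, assuming only the standard library and a throwaway child command used purely for illustration:

    import subprocess
    import sys

    # Throwaway child process that prints a single line (illustration only).
    child = [sys.executable, "-c", "print('hello from the child')"]

    # Without capture_output (as in the patched uv_run_raaid), the line shows up
    # on the terminal while the child runs, and result.stdout is None.
    streamed = subprocess.run(child, text=True, check=False)
    assert streamed.stdout is None

    # With capture_output=True (the old behaviour), the child prints nothing
    # directly; its output is buffered and handed back on the CompletedProcess.
    captured = subprocess.run(child, text=True, check=False, capture_output=True)
    assert captured.stdout.strip() == "hello from the child"

The trade-off is that on failure there is no captured stdout/stderr left to log, which is why the two logging.debug calls are dropped along with the capture.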
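
Also illustrative rather than part of the script: a sketch of reading back the predictions.json that main() writes. The instance_id and model_name_or_path keys are visible in the diff; the output location and the model_patch key are assumptions (model_patch is the conventional SWE-bench prediction field, and the file lands in whatever output_dir was passed on the command line):

    import json
    from pathlib import Path

    # Assumed location; substitute the output_dir the script was run with.
    predictions_file = Path("output") / "predictions.json"

    with predictions_file.open(encoding="utf-8") as f:
        predictions = json.load(f)

    # Each entry is the dict returned by process_instance(); an empty
    # "model_patch" means ra-aid produced no diff for that instance.
    for pred in predictions:
        status = "patched" if pred.get("model_patch") else "empty patch"
        print(f"{pred['instance_id']} ({pred['model_name_or_path']}): {status}")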