SWEBench updates.

This commit is contained in:
AI Christianson 2024-12-30 14:01:14 -05:00
parent 8b7eb76be6
commit 1e2917990e
1 changed files with 210 additions and 33 deletions

View File

@ -3,10 +3,32 @@
Script to generate predictions for SWE-bench Lite (princeton-nlp/SWE-bench_Lite). Script to generate predictions for SWE-bench Lite (princeton-nlp/SWE-bench_Lite).
This script: This script:
- Loads the SWE-bench Lite dataset - Loads the SWE-bench Lite dataset
- Clones each repo at the specified commit - For each instance, clones the repo at the specified commit into a user-defined projects directory
- Creates a dedicated Python virtual environment in the cloned repo using 'uv venv'
(the default system Python is used unless overridden in the `PYTHON_VERSION_OVERRIDES` dictionary)
- Installs `ra-aid` (in editable mode) plus any project dependencies from:
- pyproject.toml (pip install .)
- requirements.txt
- requirements-dev.txt
- Forms a prompt from the instance fields (problem_statement, FAIL_TO_PASS, PASS_TO_PASS) - Forms a prompt from the instance fields (problem_statement, FAIL_TO_PASS, PASS_TO_PASS)
- Calls ra-aid to create a patch - Calls ra-aid (from the venv) to create a patch
- Writes out predictions in the required JSON format - Writes out predictions in the required JSON format
Additionally, we provide an internal dictionary for per-project Python version overrides:
e.g.:
PYTHON_VERSION_OVERRIDES = {
"org/repo": "3.9",
"some-other-org/another-repo": "3.10",
}
If a repo name is not found in that dictionary, this script will just use the default system Python.
Required parameters:
--projects-dir : Directory where all repos are cloned.
Optional parameters:
--cleanup : If set, remove the cloned repos after processing.
""" """
import argparse import argparse
@ -15,7 +37,6 @@ import logging
import shutil import shutil
import subprocess import subprocess
import sys import sys
import tempfile
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from typing import Optional, Tuple, Dict, Any, List from typing import Optional, Tuple, Dict, Any, List
@ -26,6 +47,14 @@ from rich.logging import RichHandler
from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
# Per-repository Python version overrides.
# Keys are the repo identifier passed to setup_repo_venv() as `repo_name`
# (e.g. "pandas-dev/pandas"); values are the version string handed to
# `uv venv --python=<version>` (e.g. "3.9").
# Repos that are not listed here fall back to whatever Python uv selects by default.
PYTHON_VERSION_OVERRIDES: Dict[str, str] = {
    # "org/repo": "3.9",
    # "another-org/another-repo": "3.10",
}
def setup_logging(log_dir: Path, verbose: bool = False) -> None: def setup_logging(log_dir: Path, verbose: bool = False) -> None:
"""Configure logging with both file and console handlers.""" """Configure logging with both file and console handlers."""
log_dir.mkdir(parents=True, exist_ok=True) log_dir.mkdir(parents=True, exist_ok=True)
@ -71,14 +100,122 @@ def create_output_dirs() -> Tuple[Path, Path]:
return base_dir, log_dir return base_dir, log_dir
def install_local_raaid(pip_path: Path) -> None:
    """
    Editable-install the local ra-aid checkout using the given pip executable.

    The ra-aid repository root is taken to be the parent of the directory
    that contains this script (i.e. the script is expected to live in
    <repo_root>/scripts).

    Args:
        pip_path: Path to the pip executable of the target environment.

    Any failure (missing pip, non-zero pip exit status, ...) is logged and
    swallowed so that the caller can carry on with its remaining setup steps.
    """
    # <repo_root>/scripts/<this file>  ->  <repo_root>
    project_root = Path(__file__).resolve().parent.parent
    editable_install_cmd = [str(pip_path), "install", "-e", str(project_root)]

    try:
        subprocess.run(editable_install_cmd, cwd=str(project_root), check=True)
    except Exception as err:
        logging.error(f"Failed to install ra-aid in editable mode from {project_root}: {err}")
def _pip_install(pip_path: Path, pip_args: List[str], cwd: Path, failure_msg: str, check: bool = True) -> None:
    """Run `<pip_path> install <pip_args...>` in `cwd`; log `failure_msg` instead of raising on error."""
    try:
        subprocess.run([str(pip_path), "install", *pip_args], cwd=cwd, check=check)
    except Exception as e:
        logging.error(f"{failure_msg}: {e}")


def setup_repo_venv(repo_dir: Path, repo_name: str) -> Path:
    """
    Create a Python virtual environment in `repo_dir/.venv` using `uv venv`.

    Installs, in this order (each step is best-effort: a failure is logged
    and the remaining steps still run):
      - local ra-aid (editable mode)
      - pyproject.toml       => pip install .
      - requirements.txt     => pip install -r ...
      - requirements-dev.txt => pip install -r ...

    Python version selection:
      1) If `repo_name` is a key of PYTHON_VERSION_OVERRIDES, its value is
         passed to uv as --python=<version>.
      2) Otherwise uv picks its default Python.

    Args:
        repo_dir: Root of the cloned repository; the venv is created inside it.
        repo_name: Repo identifier used to look up a Python version override.

    Returns:
        Path to the .venv directory. It is returned even when venv creation
        fails, in which case no install step has been attempted.
    """
    venv_dir = repo_dir / ".venv"

    # Check for Python version override
    python_version = PYTHON_VERSION_OVERRIDES.get(repo_name)

    # `uv venv` creates an environment WITHOUT pip unless --seed is given.
    # Every install step below (and the later ra-aid invocation) relies on
    # executables inside the venv's bin directory, so pip must be seeded.
    uv_cmd = ["uv", "venv", "--seed"]
    if python_version:
        uv_cmd.append(f"--python={python_version}")
    uv_cmd.append(str(venv_dir))

    try:
        subprocess.run(uv_cmd, cwd=repo_dir, check=True)
    except Exception as e:
        logging.error(f"Failed to create venv in {repo_dir} using uv: {e}")
        return venv_dir  # Return anyway for partial info

    pip_path = venv_dir / "bin" / "pip"

    # Upgrade pip; a non-zero exit status is tolerated here (check=False)
    _pip_install(
        pip_path,
        ["--upgrade", "pip"],
        repo_dir,
        f"Failed to upgrade pip in {venv_dir}",
        check=False,
    )

    # 1) Install ra-aid in editable mode from our local repo
    install_local_raaid(pip_path)

    # 2) If pyproject.toml is present, install local project
    if (repo_dir / "pyproject.toml").is_file():
        _pip_install(
            pip_path,
            ["."],
            repo_dir,
            f"Failed to install project from pyproject.toml in {repo_dir}",
        )

    # 3) If requirements.txt is present
    req_path = repo_dir / "requirements.txt"
    if req_path.is_file():
        _pip_install(
            pip_path,
            ["-r", str(req_path)],
            repo_dir,
            "Failed to install from requirements.txt",
        )

    # 4) If requirements-dev.txt is present
    req_dev_path = repo_dir / "requirements-dev.txt"
    if req_dev_path.is_file():
        _pip_install(
            pip_path,
            ["-r", str(req_dev_path)],
            repo_dir,
            "Failed to install from requirements-dev.txt",
        )

    return venv_dir
def run_raaid( def run_raaid(
repo_dir: Path, repo_dir: Path,
venv_dir: Path,
problem_statement: str, problem_statement: str,
fail_tests: List[str], fail_tests: List[str],
pass_tests: List[str] pass_tests: List[str]
) -> Optional[str]: ) -> Optional[str]:
"""Run ra-aid on the problem statement, returning a generated patch if possible.""" """
# Create prompt Run ra-aid on the problem statement (using the local venv), returning a generated patch if possible.
"""
prompt = f"{problem_statement}\n\nTests that need to be fixed:\n```\n" prompt = f"{problem_statement}\n\nTests that need to be fixed:\n```\n"
for t in fail_tests: for t in fail_tests:
prompt += f"- {t}\n" prompt += f"- {t}\n"
@ -89,9 +226,10 @@ def run_raaid(
prompt += f"- {t}\n" prompt += f"- {t}\n"
prompt += "```\n\n" prompt += "```\n\n"
# Implementation phase # Use ra-aid from the local venv
raaid_exe = venv_dir / "bin" / "ra-aid"
impl_cmd = [ impl_cmd = [
'ra-aid', str(raaid_exe),
'--cowboy-mode', '--cowboy-mode',
'-m', prompt, '-m', prompt,
] ]
@ -116,7 +254,6 @@ def run_raaid(
logging.error(f"ra-aid error: {e}") logging.error(f"ra-aid error: {e}")
return None return None
# Collect patch
repo = Repo(repo_dir) repo = Repo(repo_dir)
patch = get_git_patch(repo) patch = get_git_patch(repo)
return patch return patch
@ -139,11 +276,17 @@ def get_git_patch(repo: Repo) -> Optional[str]:
return None return None
def process_instance(instance: Dict[str, Any], output_repo_dir: Path) -> Dict[str, Any]: def process_instance(
instance: Dict[str, Any],
projects_dir: Path,
cleanup: bool
) -> Dict[str, Any]:
""" """
Process a single dataset instance: Process a single dataset instance:
- Clone the repo - Clone the repo into projects_dir/<instance_id>
- Checkout commit - Checkout commit
- Build a local Python venv in that repo (checking override dict)
- Install ra-aid + any project dependencies
- Build prompt from problem_statement, FAIL_TO_PASS, PASS_TO_PASS - Build prompt from problem_statement, FAIL_TO_PASS, PASS_TO_PASS
- Return dict in required format: - Return dict in required format:
{ {
@ -151,6 +294,7 @@ def process_instance(instance: Dict[str, Any], output_repo_dir: Path) -> Dict[st
"model_patch": ..., "model_patch": ...,
"model_name_or_path": ... "model_name_or_path": ...
} }
- If cleanup is True, remove the cloned repo after generating a patch
""" """
inst_id = instance.get("instance_id", "<unknown>") inst_id = instance.get("instance_id", "<unknown>")
repo_name = instance["repo"] repo_name = instance["repo"]
@ -159,37 +303,53 @@ def process_instance(instance: Dict[str, Any], output_repo_dir: Path) -> Dict[st
fail_tests = instance.get("FAIL_TO_PASS", []) fail_tests = instance.get("FAIL_TO_PASS", [])
pass_tests = instance.get("PASS_TO_PASS", []) pass_tests = instance.get("PASS_TO_PASS", [])
# Convert to lists if they're strings
if isinstance(fail_tests, str): if isinstance(fail_tests, str):
fail_tests = [fail_tests] fail_tests = [fail_tests]
if isinstance(pass_tests, str): if isinstance(pass_tests, str):
pass_tests = [pass_tests] pass_tests = [pass_tests]
# Attempt to build a github url if not provided # Build GitHub URL
# If 'repo' is "org/repo", create https://github.com/org/repo.git
if "github.com" not in repo_name: if "github.com" not in repo_name:
repo_url = f"https://github.com/{repo_name}.git" repo_url = f"https://github.com/{repo_name}.git"
else: else:
repo_url = repo_name repo_url = repo_name
checkout_dir = projects_dir / f"{inst_id}"
patch_str = None patch_str = None
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
try: try:
# Clone & checkout if checkout_dir.exists():
repo = Repo.clone_from(repo_url, tmp_path) logging.info(f"Removing pre-existing directory: {checkout_dir}")
shutil.rmtree(checkout_dir)
# Clone and checkout
repo = Repo.clone_from(repo_url, checkout_dir)
repo.git.checkout(commit) repo.git.checkout(commit)
# Set up local Python venv & install dependencies
venv_dir = setup_repo_venv(checkout_dir, repo_name=repo_name)
# Run ra-aid
patch_str = run_raaid(
checkout_dir,
venv_dir,
problem_statement,
fail_tests,
pass_tests
)
except Exception as e: except Exception as e:
logging.error(f"Failed to clone/check out {repo_url}:{commit} - {e}") logging.error(f"Failed to process {repo_url}:{commit} - {e}")
return { return {
"instance_id": inst_id, "instance_id": inst_id,
"model_patch": "", "model_patch": "",
"model_name_or_path": "ra-aid" "model_name_or_path": "ra-aid"
} }
# Run ra-aid finally:
patch_str = run_raaid(tmp_path, problem_statement, fail_tests, pass_tests) if cleanup:
logging.info(f"Cleaning up directory: {checkout_dir}")
shutil.rmtree(checkout_dir, ignore_errors=True)
# Return required prediction structure
return { return {
"instance_id": inst_id, "instance_id": inst_id,
"model_patch": patch_str if patch_str else "", "model_patch": patch_str if patch_str else "",
@ -217,17 +377,30 @@ def main() -> None:
default=None, default=None,
help="Number of instances to process (default: all)" help="Number of instances to process (default: all)"
) )
parser.add_argument(
"--projects-dir",
type=Path,
required=True,
help="Directory where projects will be cloned. Must exist or can be created."
)
parser.add_argument(
"--cleanup",
action="store_true",
help="If set, remove the cloned repos after generating the patch."
)
args = parser.parse_args() args = parser.parse_args()
base_dir, log_dir = create_output_dirs() base_dir, log_dir = create_output_dirs()
setup_logging(log_dir, args.verbose) setup_logging(log_dir, args.verbose)
logging.info("Starting script") logging.info("Starting script")
args.projects_dir.mkdir(parents=True, exist_ok=True)
dataset = load_dataset_safely() dataset = load_dataset_safely()
if dataset is None: if dataset is None:
sys.exit(1) sys.exit(1)
# Combine "dev" and "test" splits (no "train" in this dataset) # Combine 'dev' and 'test' splits for this dataset (there is no 'train')
all_data = list(dataset["dev"]) + list(dataset["test"]) all_data = list(dataset["dev"]) + list(dataset["test"])
args.output_dir.mkdir(parents=True, exist_ok=True) args.output_dir.mkdir(parents=True, exist_ok=True)
@ -247,7 +420,11 @@ def main() -> None:
if i >= limit: if i >= limit:
break break
try: try:
pred = process_instance(inst, args.output_dir) pred = process_instance(
inst,
projects_dir=args.projects_dir,
cleanup=args.cleanup
)
predictions.append(pred) predictions.append(pred)
except Exception as e: except Exception as e:
logging.error(f"Error processing instance: {inst.get('instance_id', '')} - {e}") logging.error(f"Error processing instance: {inst.get('instance_id', '')} - {e}")