SWEBench updates.
This commit is contained in:
parent 8b7eb76be6
commit 1e2917990e
@@ -3,10 +3,32 @@
 Script to generate predictions for SWE-bench Lite (princeton-nlp/SWE-bench_Lite).
 This script:
 - Loads the SWE-bench Lite dataset
-- Clones each repo at the specified commit
+- For each instance, clones the repo at the specified commit into a user-defined projects directory
+- Creates a dedicated Python virtual environment in the cloned repo using 'uv venv'
+  (the default system Python is used unless overridden in the `PYTHON_VERSION_OVERRIDES` dictionary)
+- Installs `ra-aid` (in editable mode) plus any project dependencies from:
+  - pyproject.toml (pip install .)
+  - requirements.txt
+  - requirements-dev.txt
 - Forms a prompt from the instance fields (problem_statement, FAIL_TO_PASS, PASS_TO_PASS)
-- Calls ra-aid to create a patch
+- Calls ra-aid (from the venv) to create a patch
 - Writes out predictions in the required JSON format
+
+Additionally, we provide an internal dictionary for per-project Python version overrides,
+e.g.:
+
+PYTHON_VERSION_OVERRIDES = {
+    "org/repo": "3.9",
+    "some-other-org/another-repo": "3.10",
+}
+
+If a repo name is not found in that dictionary, this script will just use the default system Python.
+
+Required parameters:
+--projects-dir : Directory where all repos are cloned.
+
+Optional parameters:
+--cleanup : If set, remove the cloned repos after processing.
 """

 import argparse
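Note on the "required JSON format" mentioned in the docstring: the keys of each prediction record are confirmed later in this diff (in process_instance), but the code that writes the file is not part of these hunks, so the filename and the json.dump call below are assumptions in a minimal sketch.

    import json

    predictions = [
        {
            "instance_id": "astropy__astropy-12907",  # illustrative SWE-bench Lite id
            "model_patch": "diff --git a/... (unified diff produced by ra-aid)",
            "model_name_or_path": "ra-aid",
        }
    ]
    with open("predictions.json", "w") as f:  # assumed output filename
        json.dump(predictions, f, indent=2)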
@@ -15,7 +37,6 @@ import logging
 import shutil
 import subprocess
 import sys
-import tempfile
 from datetime import datetime
 from pathlib import Path
 from typing import Optional, Tuple, Dict, Any, List
@@ -26,6 +47,14 @@ from rich.logging import RichHandler
 from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn


+# If you'd like to override Python versions for specific repos:
+# For example: "pandas-dev/pandas": "3.9"
+PYTHON_VERSION_OVERRIDES = {
+    # "org/repo": "3.9",
+    # "another-org/another-repo": "3.10",
+}
+
+
 def setup_logging(log_dir: Path, verbose: bool = False) -> None:
     """Configure logging with both file and console handlers."""
     log_dir.mkdir(parents=True, exist_ok=True)
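To illustrate how this dict is consumed (mirroring the uv command construction in setup_repo_venv further down; the repo names here are placeholders): an entry yields a --python=<version> flag, and a missing entry falls back to the default system Python.

    overrides = {"pandas-dev/pandas": "3.9"}  # hypothetical override entry
    for repo_name in ("pandas-dev/pandas", "org/other-repo"):
        version = overrides.get(repo_name)
        uv_cmd = ["uv", "venv"]
        if version:
            uv_cmd.append(f"--python={version}")
        uv_cmd.append(".venv")
        print(repo_name, uv_cmd)
    # pandas-dev/pandas -> ['uv', 'venv', '--python=3.9', '.venv']
    # org/other-repo    -> ['uv', 'venv', '.venv']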
@@ -71,14 +100,122 @@ def create_output_dirs() -> Tuple[Path, Path]:
     return base_dir, log_dir


+def install_local_raaid(pip_path: Path) -> None:
+    """
+    Install ra-aid (in editable mode) into the local environment.
+    We assume that this script lives in <repo_root>/scripts, so the
+    root directory is one level up from __file__.
+    """
+    script_dir = Path(__file__).resolve().parent
+    repo_root = script_dir.parent  # one level up
+    try:
+        subprocess.run(
+            [str(pip_path), "install", "-e", str(repo_root)],
+            cwd=str(repo_root),
+            check=True
+        )
+    except Exception as e:
+        logging.error(f"Failed to install ra-aid in editable mode from {repo_root}: {e}")
+
+
+def setup_repo_venv(repo_dir: Path, repo_name: str) -> Path:
+    """
+    Create a Python virtual environment in `repo_dir/.venv` using `uv venv`.
+    Installs:
+    - local ra-aid (editable mode)
+    - pyproject.toml => pip install .
+    - requirements.txt => pip install -r ...
+    - requirements-dev.txt => pip install -r ...
+
+    Steps to determine Python version:
+    1) Check the PYTHON_VERSION_OVERRIDES dict for the given repo_name.
+       If found, use that as the --python=<version> argument.
+    2) Otherwise, let uv pick the default system Python.
+
+    Returns:
+        Path to the .venv directory
+    """
+    venv_dir = repo_dir / ".venv"
+
+    # Check for Python version override
+    python_version = PYTHON_VERSION_OVERRIDES.get(repo_name, None)
+
+    # Construct the uv command
+    uv_cmd = ["uv", "venv"]
+    if python_version:
+        uv_cmd.append(f"--python={python_version}")
+    uv_cmd.append(str(venv_dir))
+
+    try:
+        subprocess.run(uv_cmd, cwd=repo_dir, check=True)
+    except Exception as e:
+        logging.error(f"Failed to create venv in {repo_dir} using uv: {e}")
+        return venv_dir  # Return anyway for partial info
+
+    pip_path = venv_dir / "bin" / "pip"
+
+    # Upgrade pip
+    try:
+        subprocess.run(
+            [str(pip_path), "install", "--upgrade", "pip"],
+            cwd=repo_dir,
+            check=False
+        )
+    except Exception as e:
+        logging.error(f"Failed to upgrade pip in {venv_dir}: {e}")
+
+    # 1) Install ra-aid in editable mode from our local repo
+    install_local_raaid(pip_path)
+
+    # 2) If pyproject.toml is present, install local project
+    pyproject_path = repo_dir / "pyproject.toml"
+    if pyproject_path.is_file():
+        try:
+            subprocess.run(
+                [str(pip_path), "install", "."],
+                cwd=repo_dir,
+                check=True
+            )
+        except Exception as e:
+            logging.error(f"Failed to install project from pyproject.toml in {repo_dir}: {e}")
+
+    # 3) If requirements.txt is present
+    req_path = repo_dir / "requirements.txt"
+    if req_path.is_file():
+        try:
+            subprocess.run(
+                [str(pip_path), "install", "-r", str(req_path)],
+                cwd=repo_dir,
+                check=True
+            )
+        except Exception as e:
+            logging.error(f"Failed to install from requirements.txt: {e}")
+
+    # 4) If requirements-dev.txt is present
+    req_dev_path = repo_dir / "requirements-dev.txt"
+    if req_dev_path.is_file():
+        try:
+            subprocess.run(
+                [str(pip_path), "install", "-r", str(req_dev_path)],
+                cwd=repo_dir,
+                check=True
+            )
+        except Exception as e:
+            logging.error(f"Failed to install from requirements-dev.txt: {e}")
+
+    return venv_dir
+
+
 def run_raaid(
     repo_dir: Path,
+    venv_dir: Path,
     problem_statement: str,
     fail_tests: List[str],
     pass_tests: List[str]
 ) -> Optional[str]:
-    """Run ra-aid on the problem statement, returning a generated patch if possible."""
-    # Create prompt
+    """
+    Run ra-aid on the problem statement (using the local venv), returning a generated patch if possible.
+    """
     prompt = f"{problem_statement}\n\nTests that need to be fixed:\n```\n"
     for t in fail_tests:
         prompt += f"- {t}\n"
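One practical caveat (not part of this commit): setup_repo_venv shells out to uv, so a missing uv binary only surfaces as a failed subprocess call. A small pre-flight check, sketched here as an assumption, would make that failure explicit; shutil is already imported by the script.

    import logging
    import shutil

    if shutil.which("uv") is None:
        logging.error("`uv` not found on PATH; venv creation for each repo will fail")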
@@ -89,9 +226,10 @@ def run_raaid(
         prompt += f"- {t}\n"
     prompt += "```\n\n"

-    # Implementation phase
+    # Use ra-aid from the local venv
+    raaid_exe = venv_dir / "bin" / "ra-aid"
     impl_cmd = [
-        'ra-aid',
+        str(raaid_exe),
         '--cowboy-mode',
         '-m', prompt,
     ]
@@ -116,7 +254,6 @@ def run_raaid(
         logging.error(f"ra-aid error: {e}")
         return None

-    # Collect patch
     repo = Repo(repo_dir)
     patch = get_git_patch(repo)
     return patch
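get_git_patch's body is not part of this diff; for orientation only, here is a minimal sketch of how a patch can be collected from a GitPython Repo. It is an illustration under assumptions, not the file's actual implementation.

    from typing import Optional
    from git import Repo  # GitPython

    def collect_patch_sketch(repo: Repo) -> Optional[str]:
        """Return the working-tree changes as a unified diff, or None if clean."""
        repo.git.add(A=True)               # stage everything so new files show up
        diff_text = repo.git.diff("HEAD")  # unified diff of the work tree vs HEAD
        return diff_text or None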
@@ -139,11 +276,17 @@ def get_git_patch(repo: Repo) -> Optional[str]:
     return None


-def process_instance(instance: Dict[str, Any], output_repo_dir: Path) -> Dict[str, Any]:
+def process_instance(
+    instance: Dict[str, Any],
+    projects_dir: Path,
+    cleanup: bool
+) -> Dict[str, Any]:
     """
     Process a single dataset instance:
-    - Clone the repo
+    - Clone the repo into projects_dir/<instance_id>
     - Checkout commit
+    - Build a local Python venv in that repo (checking override dict)
+    - Install ra-aid + any project dependencies
     - Build prompt from problem_statement, FAIL_TO_PASS, PASS_TO_PASS
     - Return dict in required format:
       {
@@ -151,6 +294,7 @@ def process_instance(
         "model_patch": ...,
         "model_name_or_path": ...
       }
+    - If cleanup is True, remove the cloned repo after generating a patch
     """
     inst_id = instance.get("instance_id", "<unknown>")
     repo_name = instance["repo"]
@@ -159,37 +303,53 @@ def process_instance(
     fail_tests = instance.get("FAIL_TO_PASS", [])
     pass_tests = instance.get("PASS_TO_PASS", [])

-    # Convert to lists if they're strings
     if isinstance(fail_tests, str):
         fail_tests = [fail_tests]
     if isinstance(pass_tests, str):
         pass_tests = [pass_tests]

-    # Attempt to build a github url if not provided
-    # If 'repo' is "org/repo", create https://github.com/org/repo.git
+    # Build GitHub URL
     if "github.com" not in repo_name:
         repo_url = f"https://github.com/{repo_name}.git"
     else:
         repo_url = repo_name

+    checkout_dir = projects_dir / f"{inst_id}"
     patch_str = None
-    with tempfile.TemporaryDirectory() as tmp:
-        tmp_path = Path(tmp)
     try:
-        # Clone & checkout
-        repo = Repo.clone_from(repo_url, tmp_path)
+        if checkout_dir.exists():
+            logging.info(f"Removing pre-existing directory: {checkout_dir}")
+            shutil.rmtree(checkout_dir)
+
+        # Clone and checkout
+        repo = Repo.clone_from(repo_url, checkout_dir)
         repo.git.checkout(commit)
+
+        # Set up local Python venv & install dependencies
+        venv_dir = setup_repo_venv(checkout_dir, repo_name=repo_name)
+
+        # Run ra-aid
+        patch_str = run_raaid(
+            checkout_dir,
+            venv_dir,
+            problem_statement,
+            fail_tests,
+            pass_tests
+        )
+
     except Exception as e:
-        logging.error(f"Failed to clone/check out {repo_url}:{commit} - {e}")
+        logging.error(f"Failed to process {repo_url}:{commit} - {e}")
         return {
             "instance_id": inst_id,
             "model_patch": "",
             "model_name_or_path": "ra-aid"
         }
-    # Run ra-aid
-    patch_str = run_raaid(tmp_path, problem_statement, fail_tests, pass_tests)
+    finally:
+        if cleanup:
+            logging.info(f"Cleaning up directory: {checkout_dir}")
+            shutil.rmtree(checkout_dir, ignore_errors=True)

-    # Return required prediction structure
     return {
         "instance_id": inst_id,
         "model_patch": patch_str if patch_str else "",
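For a concrete sense of the new layout: each instance is now checked out under the --projects-dir rather than a temporary directory. The id below is only an example of the SWE-bench Lite instance_id format, and the directory path is a placeholder.

    from pathlib import Path

    projects_dir = Path("/data/swebench_projects")  # value passed via --projects-dir
    inst_id = "astropy__astropy-12907"              # illustrative instance_id
    checkout_dir = projects_dir / f"{inst_id}"
    print(checkout_dir)  # /data/swebench_projects/astropy__astropy-12907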
@@ -217,17 +377,30 @@ def main() -> None:
         default=None,
         help="Number of instances to process (default: all)"
     )
+    parser.add_argument(
+        "--projects-dir",
+        type=Path,
+        required=True,
+        help="Directory where projects will be cloned. Must exist or can be created."
+    )
+    parser.add_argument(
+        "--cleanup",
+        action="store_true",
+        help="If set, remove the cloned repos after generating the patch."
+    )
     args = parser.parse_args()

     base_dir, log_dir = create_output_dirs()
     setup_logging(log_dir, args.verbose)
     logging.info("Starting script")

+    args.projects_dir.mkdir(parents=True, exist_ok=True)
+
     dataset = load_dataset_safely()
     if dataset is None:
         sys.exit(1)

-    # Combine "dev" and "test" splits (no "train" in this dataset)
+    # Combine 'dev' and 'test' splits for this dataset (there is no 'train')
     all_data = list(dataset["dev"]) + list(dataset["test"])

     args.output_dir.mkdir(parents=True, exist_ok=True)
@@ -247,7 +420,11 @@ def main() -> None:
         if i >= limit:
             break
         try:
-            pred = process_instance(inst, args.output_dir)
+            pred = process_instance(
+                inst,
+                projects_dir=args.projects_dir,
+                cleanup=args.cleanup
+            )
             predictions.append(pred)
         except Exception as e:
             logging.error(f"Error processing instance: {inst.get('instance_id', '')} - {e}")