RA.Aid/ra_aid/file_listing.py

248 lines
8.4 KiB
Python

"""Module for efficient file listing using git."""
import os
import subprocess
from pathlib import Path
from typing import List, Optional, Tuple
import fnmatch
class FileListerError(Exception):
    """Root of the exception hierarchy used by the file listing helpers.

    Catching this type also catches GitCommandError, DirectoryNotFoundError
    and DirectoryAccessError, which all derive from it.
    """
class GitCommandError(FileListerError):
    """Signals that an invoked git command did not complete successfully."""
class DirectoryNotFoundError(FileListerError):
    """Signals that the requested path is missing or is not a directory."""
class DirectoryAccessError(FileListerError):
    """Signals that a directory exists but could not be read (permissions)."""
def is_git_repo(directory: str) -> bool:
    """
    Check if the given directory is a git repository.

    Runs ``git rev-parse --git-dir`` with the directory as working directory
    and treats a zero exit status as "this is a git repository".

    Args:
        directory: Path to the directory to check

    Returns:
        bool: True if directory is a git repository, False otherwise

    Raises:
        DirectoryNotFoundError: If directory does not exist or is not a directory
        DirectoryAccessError: If directory cannot be accessed
        GitCommandError: If git command fails unexpectedly
        FileListerError: For any other unexpected error while checking
    """
    try:
        path = Path(directory).resolve()
        if not path.exists():
            raise DirectoryNotFoundError(f"Directory does not exist: {directory}")
        if not path.is_dir():
            raise DirectoryNotFoundError(f"Path is not a directory: {directory}")

        # check=True is deliberately NOT passed: a non-zero exit status is the
        # normal "not a repository" answer, not an error.
        result = subprocess.run(
            ["git", "rev-parse", "--git-dir"],
            cwd=str(path),
            capture_output=True,
            text=True,
        )
        return result.returncode == 0
    except FileListerError:
        # Errors raised above already have the right type; pass them through
        # untouched instead of re-wrapping them in the catch-all below.
        raise
    except subprocess.CalledProcessError as e:
        # Not raised by the call above (no check=True); kept so the documented
        # GitCommandError contract holds if the invocation ever changes.
        raise GitCommandError(f"Git command failed: {e}") from e
    except PermissionError as e:
        raise DirectoryAccessError(f"Cannot access directory {directory}: {e}") from e
    except Exception as e:
        # Anything else is reported as a generic listing error, with the
        # original exception preserved as __cause__ for debugging.
        raise FileListerError(f"Error checking git repository: {e}") from e
def _git_project_files(directory: str, include_hidden: bool) -> List[str]:
    """Return tracked plus untracked-but-not-ignored files reported by ``git ls-files``."""
    commands = (
        ["git", "ls-files"],
        ["git", "ls-files", "--others", "--exclude-standard"],
    )
    lines: List[str] = []
    try:
        for command in commands:
            completed = subprocess.run(
                command,
                cwd=directory,
                capture_output=True,
                text=True,
                check=True,
            )
            lines.extend(completed.stdout.splitlines())
    except subprocess.CalledProcessError as e:
        raise GitCommandError(f"Git command failed: {e}") from e
    except PermissionError as e:
        raise DirectoryAccessError(f"Permission denied: {e}") from e

    files: List[str] = []
    for line in lines:
        name = line.strip()
        if not name:
            continue
        # A path is hidden when ANY component starts with a dot; this also
        # covers the case where the path itself starts with a dot.
        if not include_hidden and any(
            part.startswith(".") for part in name.split("/")
        ):
            continue
        # Skip .aider files (substring match, applied even with include_hidden)
        if ".aider" in name:
            continue
        files.append(name)
    return files


def _walk_project_files(
    directory: str, include_hidden: bool, excluded_dirs: set
) -> List[str]:
    """Return files found by walking ``directory``, pruning ``excluded_dirs`` and hidden entries."""
    # Existence was verified by the caller; this probe only surfaces permission
    # problems on the top-level directory.
    try:
        os.listdir(directory)
    except FileNotFoundError:
        # Should not normally happen after the existence check, but it can
        # happen in mock tests; ignore and let the walk proceed.
        pass
    except PermissionError as e:
        raise DirectoryAccessError(f"Cannot access directory {directory}: {e}") from e

    files: List[str] = []
    try:
        for root, dirs, names in os.walk(directory):
            # Prune in place so os.walk does not descend into skipped directories
            dirs[:] = [
                d
                for d in dirs
                if d not in excluded_dirs
                and (include_hidden or not d.startswith("."))
            ]
            rel_root = os.path.relpath(root, directory)
            if rel_root == ".":
                rel_root = ""
            for name in names:
                if not include_hidden and name.startswith("."):
                    continue
                # os.path.join("", name) is just name, so top-level files
                # come out without a leading "./"
                files.append(os.path.join(rel_root, name))
    except PermissionError as e:
        raise DirectoryAccessError(
            f"Permission denied while walking directory {directory}: {e}"
        ) from e
    return files


def get_all_project_files(
    directory: str, include_hidden: bool = False, exclude_patterns: Optional[List[str]] = None
) -> List[str]:
    """
    Get a list of all files in a project directory, handling both git and non-git repositories.

    Git repositories are listed with ``git ls-files`` (tracked files plus
    untracked files that are not ignored); other directories are walked with
    ``os.walk``.

    Args:
        directory: Path to the directory
        include_hidden: Whether to include hidden files (starting with .) in the results
        exclude_patterns: Optional list of fnmatch-style patterns; any file whose
            relative path matches one of them is dropped

    Returns:
        List[str]: Sorted, de-duplicated list of file paths relative to the directory

    Raises:
        DirectoryNotFoundError: If directory does not exist or is not a directory
        DirectoryAccessError: If directory cannot be accessed
        GitCommandError: If git command fails
    """
    # Check if directory exists and is accessible
    if not os.path.exists(directory):
        raise DirectoryNotFoundError(f"Directory not found: {directory}")
    if not os.path.isdir(directory):
        raise DirectoryNotFoundError(f"Not a directory: {directory}")

    # Default excluded directories.
    # NOTE(review): these are only applied by the filesystem walk; the git path
    # relies on .gitignore plus the hidden/.aider filters.
    excluded_dirs = {'.ra-aid', '.venv', '.git', '.aider', '__pycache__'}

    # Check if it's a git repository
    try:
        is_git = is_git_repo(directory)
    except FileListerError:
        # If checking fails, default to non-git approach
        is_git = False

    if is_git:
        all_files = _git_project_files(directory, include_hidden)
    else:
        all_files = _walk_project_files(directory, include_hidden, excluded_dirs)

    # Apply additional exclude patterns if specified (single pass over the files)
    if exclude_patterns:
        all_files = [
            f
            for f in all_files
            if not any(fnmatch.fnmatch(f, pattern) for pattern in exclude_patterns)
        ]

    # Remove duplicates and sort
    return sorted(set(all_files))
def get_file_listing(
    directory: str, limit: Optional[int] = None, include_hidden: bool = False
) -> Tuple[List[str], int]:
    """
    Get a list of files in a directory.

    For git repositories, uses `git ls-files` for efficient file listing that respects .gitignore rules.
    For non-git directories, falls back to manual file listing using Python's standard library.

    Returns a tuple containing the list of files (truncated if limit is specified)
    and the total count of files.

    Args:
        directory: Path to the directory
        limit: Optional maximum number of files to return
        include_hidden: Whether to include hidden files (starting with .) in the results

    Returns:
        Tuple[List[str], int]: Tuple containing:
            - List of file paths (truncated to limit if specified)
            - Total number of files (before truncation)

    Raises:
        DirectoryNotFoundError: If directory does not exist
        DirectoryAccessError: If directory cannot be accessed
        GitCommandError: If git command fails
        FileListerError: For other unexpected errors
    """
    try:
        # Use the common function to get all files
        all_files = get_all_project_files(directory, include_hidden)

        # Get total count before truncation
        total_count = len(all_files)

        # Apply limit if specified
        if limit is not None:
            all_files = all_files[:limit]

        return all_files, total_count
    except FileListerError:
        # Module errors (including all subclasses) already carry the right
        # type and message; re-raise them without wrapping.
        raise
    except PermissionError as e:
        raise DirectoryAccessError(f"Permission denied: {e}") from e
    except Exception as e:
        # Keep the original exception as __cause__ so the root failure
        # remains visible in tracebacks.
        raise FileListerError(f"Unexpected error: {e}") from e