refactor: Extract code contribution stats logic to separate module

This commit is contained in:
josix 2025-01-17 01:27:49 +08:00
parent 3d4fb68172
commit 3b40edec93
No known key found for this signature in database
GPG key ID: EF4F614562B02881
3 changed files with 109 additions and 71 deletions

View file

@ -4,7 +4,7 @@ import re
import subprocess import subprocess
import sys import sys
import tempfile import tempfile
from collections import OrderedDict from collections import OrderedDict, defaultdict
from os.path import expanduser from os.path import expanduser
from pathlib import Path from pathlib import Path
@ -23,6 +23,7 @@ from aider.repo import ANY_GIT_ERROR
from aider.run_cmd import run_cmd from aider.run_cmd import run_cmd
from aider.scrape import Scraper, install_playwright from aider.scrape import Scraper, install_playwright
from aider.utils import is_image_file from aider.utils import is_image_file
from aider.stats import hash_len, get_all_commit_hashes_between_tags, get_commit_authors, get_counts_for_file
from .dump import dump # noqa: F401 from .dump import dump # noqa: F401
@ -1520,6 +1521,9 @@ class Commands:
self.io.tool_error("No main or master branch found. Please specify a revision.") self.io.tool_error("No main or master branch found. Please specify a revision.")
return return
source_revision, target_revision = args.split("..") if ".." in args else (args, "HEAD") source_revision, target_revision = args.split("..") if ".." in args else (args, "HEAD")
commits = get_all_commit_hashes_between_tags(source_revision, target_revision)
commits = [commit[:hash_len] for commit in commits] if commits else []
authors = get_commit_authors(commits)
# Get files changed between revisions # Get files changed between revisions
diff_files = self.coder.repo.repo.git.diff( diff_files = self.coder.repo.repo.git.diff(
@ -1535,49 +1539,29 @@ class Commands:
'.ttf', '.otf', '.woff', '.woff2', '.eot' # fonts '.ttf', '.otf', '.woff', '.woff2', '.eot' # fonts
))] ))]
self.io.tool_output(f"Found {len(files)} non-binary tracked files in the repository.") self.io.tool_output(f"Found {len(files)} non-binary tracked files in the repository.")
total_lines = 0 all_file_counts = {}
aider_lines = 0 grand_total = defaultdict(int)
aider_total = 0
for file in files: for file in files:
try: file_counts = get_counts_for_file(source_revision, target_revision, authors, file)
# Run git blame for each file if file_counts:
blame_output = self.coder.repo.repo.git.blame( all_file_counts[file] = file_counts
f"{source_revision}..{target_revision}", "-M", "-C", "--line-porcelain", "--", file for author, count in file_counts.items():
) grand_total[author] += count
if "(aider)" in author.lower():
# Parse blame output aider_total += count
for line in blame_output.split('filename'): total_lines = sum(grand_total.values())
total_lines += 1 aider_percentage = (aider_total / total_lines) * 100 if total_lines > 0 else 0
for field in line.split('\n'):
# Check author and committer lines for aider attribution
author_match = False
committer_match = False
if field.startswith("author ") or field.startswith("committer "):
author_match = "(aider)" in field.lower()
committer_match = "(aider)" in field.lower()
if author_match or committer_match:
aider_lines += 1
except Exception as e:
if "no such path" not in str(e).lower():
self.io.tool_error(f"Error processing {file}: {e}")
# Calculate percentages # Calculate percentages
if total_lines > 0: if total_lines > 0:
aider_percentage = (aider_lines / total_lines) * 100 # Output overall statistics
human_lines = total_lines - aider_lines self.io.tool_output(f"\nAnalysis from {source_revision} to {target_revision}:")
human_percentage = (human_lines / total_lines) * 100 self.io.tool_output(f"Total lines analyzed: {total_lines:,}")
self.io.tool_output(f"Lines by aider: {aider_total:,} ({aider_percentage:.1f}%)")
self.io.tool_output(f"Lines by humans: {total_lines - aider_total:,} ({100 - aider_percentage:.1f}%)")
# Display results
self.io.tool_output("\nCode contribution statistics:")
self.io.tool_output(f"Total lines of code: {total_lines:,}")
self.io.tool_output(
f"Human-written code: {human_lines:,} lines ({human_percentage:.1f}%)"
)
self.io.tool_output(
f"Aider-written code: {aider_lines:,} lines ({aider_percentage:.1f}%)"
)
else: else:
self.io.tool_output("No lines of code found in the repository.") self.io.tool_output("No lines of code found in the repository.")

84
aider/stats.py Normal file
View file

@ -0,0 +1,84 @@
import subprocess
import sys
from collections import defaultdict
# Number of leading characters of a commit hash used as its short identifier
# when slicing hashes and blame lines (the width of e.g. "44e6fefc2").
hash_len = 9
def run(cmd):
    """Execute *cmd* and return what it wrote to standard output.

    *cmd* is passed unchanged to ``subprocess.run``. Both output streams are
    captured and decoded as text. Because ``check=True`` is set, a non-zero
    exit status raises ``subprocess.CalledProcessError``.
    """
    completed = subprocess.run(cmd, check=True, text=True, capture_output=True)
    return completed.stdout
def get_all_commit_hashes_between_tags(start_tag, end_tag=None):
    """List the commit hashes in the range ``start_tag..end_tag``.

    The hashes come from ``git rev-list``. When *end_tag* is falsy the range
    ends at ``HEAD``.

    Returns:
        A list of hash strings in the order git printed them. An empty range
        yields an empty list rather than ``None``, so callers can iterate or
        slice the result without a guard.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status
            (propagated from ``run``).
    """
    range_end = end_tag or "HEAD"
    output = run(["git", "rev-list", f"{start_tag}..{range_end}"])
    # Hashes never contain whitespace, so split() yields one entry per hash
    # and returns [] for empty or whitespace-only output.
    return output.split()
def get_commit_authors(commits):
    """Map each commit to its author name, tagging aider-written commits.

    For every entry in *commits*, the author name (``%an``) and the subject
    line (``%s``) are read with a single ``git show`` call. If the subject
    starts with ``aider:`` (case-insensitive), the suffix `` (aider)`` is
    appended to the author name.

    Args:
        commits: iterable of commit identifiers accepted by ``git show``.
            ``None`` or an empty iterable produces an empty mapping.

    Returns:
        A dict keyed by the commit identifiers exactly as they were passed in.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status
            (propagated from ``run``).
    """
    commit_to_author = {}
    for commit in commits or ():
        # Author name and subject are each a single line, so a newline
        # separator lets one git invocation supply both fields.
        shown = run(["git", "show", "-s", "--format=%an%n%s", commit])
        author, _, subject = shown.partition("\n")
        author = author.strip()
        if subject.strip().lower().startswith("aider:"):
            author += " (aider)"
        commit_to_author[commit] = author
    return commit_to_author
def get_counts_for_file(start_tag, end_tag, authors, fname):
    """Count the blamed lines of *fname* per author within a revision range.

    Runs ``git blame`` over ``start_tag..end_tag`` (ending at ``HEAD`` when
    *end_tag* is falsy) and attributes each output line to an author by
    looking up the first ``hash_len`` characters of the line in *authors*.

    Args:
        start_tag: revision that opens the range.
        end_tag: revision that closes the range, or a falsy value for HEAD.
        authors: mapping from abbreviated commit hash to author name; hashes
            missing from it are counted under ``"Unknown"``.
        fname: path of the file to blame.

    Returns:
        A dict of ``{author: line_count}``, or ``None`` when git produced no
        output or the blame command failed.
    """
    range_end = end_tag or "HEAD"
    blame_cmd = [
        "git",
        "blame",
        "-M100",  # Detect moved lines within a file with 100% similarity
        "-C100",  # Detect moves across files with 100% similarity
        "-C",  # Increase detection effort
        "-C",  # Increase detection effort even more
        "--abbrev=9",
        f"{start_tag}..{range_end}",
        "--",
        fname,
    ]

    try:
        text = run(blame_cmd)
    except subprocess.CalledProcessError as e:
        # str(e) only contains the command and exit status; git's own message
        # is in the captured stderr, so that is where the check must look.
        git_stderr = e.stderr or ""
        if isinstance(git_stderr, bytes):
            git_stderr = git_stderr.decode(errors="replace")
        if "no such path" not in git_stderr.lower():
            # Anything other than "file absent from this range" is reported.
            print(f"Warning: Unable to blame file {fname}. Error: {e}", file=sys.stderr)
        return None

    if not text:
        return None

    line_counts = defaultdict(int)
    for line in text.splitlines():
        # A leading caret marks a boundary commit, i.e. a line that predates
        # the requested range; such lines are not counted.
        if line.startswith("^"):
            continue
        author = authors.get(line[:hash_len], "Unknown")
        line_counts[author] += 1
    return dict(line_counts)

View file

@ -2,8 +2,6 @@
import argparse import argparse
import os import os
import subprocess
import sys
from collections import defaultdict from collections import defaultdict
from datetime import datetime from datetime import datetime
from operator import itemgetter from operator import itemgetter
@ -12,6 +10,8 @@ import semver
import yaml import yaml
from tqdm import tqdm from tqdm import tqdm
from aider.stats import get_all_commit_hashes_between_tags, run, hash_len, get_commit_authors, get_counts_for_file
website_files = [ website_files = [
"aider/website/index.html", "aider/website/index.html",
"aider/website/share/index.md", "aider/website/share/index.md",
@ -68,36 +68,6 @@ def blame(start_tag, end_tag=None):
return all_file_counts, grand_total, total_lines, aider_total, aider_percentage, end_date return all_file_counts, grand_total, total_lines, aider_total, aider_percentage, end_date
def get_all_commit_hashes_between_tags(start_tag, end_tag=None):
if end_tag:
res = run(["git", "rev-list", f"{start_tag}..{end_tag}"])
else:
res = run(["git", "rev-list", f"{start_tag}..HEAD"])
if res:
commit_hashes = res.strip().split("\n")
return commit_hashes
def run(cmd):
# Get all commit hashes since the specified tag
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
return result.stdout
def get_commit_authors(commits):
commit_to_author = dict()
for commit in commits:
author = run(["git", "show", "-s", "--format=%an", commit]).strip()
commit_message = run(["git", "show", "-s", "--format=%s", commit]).strip()
if commit_message.lower().startswith("aider:"):
author += " (aider)"
commit_to_author[commit] = author
return commit_to_author
hash_len = len("44e6fefc2")
def process_all_tags_since(start_tag): def process_all_tags_since(start_tag):
tags = get_all_tags_since(start_tag) tags = get_all_tags_since(start_tag)