diff --git a/_posts/2024-05-22-swe-bench-lite.md b/_posts/2024-05-22-swe-bench-lite.md index 4021415f4..caa72c11f 100644 --- a/_posts/2024-05-22-swe-bench-lite.md +++ b/_posts/2024-05-22-swe-bench-lite.md @@ -1,12 +1,15 @@ --- title: Aider scores SOTA 26.3% on SWE Bench Lite excerpt: Aider scored 26.3% on SWE Bench Lite, achieving a state of the art result. +highlight_image: /assets/swe_bench_lite.jpg draft: true --- +[![SWE Bench Lite results](/assets/swe_bench_lite.svg)](https://aider.chat/assets/swe_bench_lite.svg) + # Aider scores SOTA 26.3% on SWE Bench Lite -[Aider scored 26.3%]() +Aider scored 26.3% on the [SWE Bench Lite benchmark](https://www.swebench.com), achieving a state of the art result. The current top leaderboard entry is 20.33% @@ -14,6 +17,8 @@ from Amazon Q Developer Agent. The best result reported elsewhere online seems to be [22.3% from AutoCodeRover](https://github.com/nus-apr/auto-code-rover). +## Interactive, not agentic + Aider achieved this result mainly through its focus on static code analysis, reliable LLM code editing and pragmatic workflows for interactive pair programming with AI. @@ -33,6 +38,8 @@ When a user asks aider for a change, they see the edits performed in real-time. Aider may also then offer additional help like fixing lint or test errors. +## Methodology + For the benchmark, aider was launched in each problem's git repository with the problem statement @@ -113,7 +120,7 @@ Some noteworthy observations: | 6 | Opus | 1 | 1.3 | 100.0 |**Total**| | **79** | **100%** | **100%** | -If we just look at which models produced correct solutions, +If we break down correct solutions purely by model, we can see that GPT-4o dominates. This isn't a fair comparison, because GPT-4o always took the first attempt at solving. @@ -145,8 +152,7 @@ to provide a compact and powerful summary of the entire code base. The map is constantly tailored to show repo context that is relevant to the current state of the chat conversation. 
- -by performing a graph optimization on the code's call graph. +This is done by performing a graph optimization on the code's call graph. When the user asks for a change to their code, the LLM uses the repo map to decide which files to edit. diff --git a/assets/swe_bench_lite.jpg b/assets/swe_bench_lite.jpg new file mode 100644 index 000000000..32e2abf4c Binary files /dev/null and b/assets/swe_bench_lite.jpg differ diff --git a/assets/swe_bench_lite.svg b/assets/swe_bench_lite.svg new file mode 100644 index 000000000..4da3fc0fd --- /dev/null +++ b/assets/swe_bench_lite.svg @@ -0,0 +1,1632 @@ + + + + + + + + 2024-05-22T15:20:34.149598 + image/svg+xml + + + Matplotlib v3.9.0, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmark/swe_bench_lite.py b/benchmark/swe_bench_lite.py index 3a99428ae..cdbbf8227 100644 --- a/benchmark/swe_bench_lite.py +++ b/benchmark/swe_bench_lite.py @@ -1,6 +1,7 @@ import matplotlib.pyplot as plt -from 
matplotlib import rc from imgcat import imgcat +from matplotlib import rc + def plot_swe_bench_lite(data_file): with open(data_file, "r") as file: @@ -22,30 +23,37 @@ def plot_swe_bench_lite(data_file): rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10}) - fig, ax = plt.subplots(figsize=(10, 5)) + fig, ax = plt.subplots(figsize=(10, 6)) ax.grid(axis="y", zorder=0, lw=0.2) for spine in ax.spines.values(): spine.set_edgecolor("#DDDDDD") spine.set_linewidth(0.5) - colors = [ - "#b3e6a8" if "Aider" in model else "#b3d1e6" for model in models - ] + colors = ["#b3e6a8" if "Aider" in model else "#b3d1e6" for model in models] bars = ax.bar(models, pass_rates, color=colors, alpha=0.75, zorder=3) for bar in bars: yval = bar.get_height() - ax.text(bar.get_x() + bar.get_width()/2, yval + 0.5, f'{yval}%', ha='center', va='bottom', fontsize=12, alpha=0.75) + ax.text( + bar.get_x() + bar.get_width() / 2, + yval + 0.5, + f"{yval}%", + ha="center", + va="bottom", + fontsize=12, + alpha=0.75, + ) - #ax.set_xlabel("Models", fontsize=18) + # ax.set_xlabel("Models", fontsize=18) ax.set_ylabel("Pass rate (%)", fontsize=18) - ax.set_title("SWE Bench Lite pass rates", fontsize=20) + ax.set_title("SWE Bench Lite", fontsize=20) ax.set_ylim(0, 30) - plt.xticks(rotation=45, ha='right', fontsize=16) + plt.xticks(rotation=45, ha="right", fontsize=16) plt.tight_layout(pad=3.0) - plt.savefig("swe_bench_lite.png") + plt.savefig("swe_bench_lite.jpg") plt.savefig("swe_bench_lite.svg") imgcat(fig) + # Example usage plot_swe_bench_lite("benchmark/tmp.txt")