mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-30 09:14:59 +00:00
copy
This commit is contained in:
parent
19e7823859
commit
0b01b7caf5
4 changed files with 1660 additions and 14 deletions
|
@ -1,12 +1,15 @@
|
||||||
---
|
---
|
||||||
title: Aider scores SOTA 26.3% on SWE Bench Lite
|
title: Aider scores SOTA 26.3% on SWE Bench Lite
|
||||||
excerpt: Aider scored 26.3% on SWE Bench Lite, achieving a state of the art result.
|
excerpt: Aider scored 26.3% on SWE Bench Lite, achieving a state-of-the-art result.
|
||||||
|
highlight_image: /assets/swe_bench_lite.jpg
|
||||||
draft: true
|
draft: true
|
||||||
---
|
---
|
||||||
|
|
||||||
|
[![SWE Bench Lite results](/assets/swe_bench_lite.svg)](https://aider.chat/assets/swe_bench_lite.svg)
|
||||||
|
|
||||||
# Aider scores SOTA 26.3% on SWE Bench Lite
|
# Aider scores SOTA 26.3% on SWE Bench Lite
|
||||||
|
|
||||||
[Aider scored 26.3%]()
|
Aider scored 26.3%
|
||||||
on the
|
on the
|
||||||
[SWE Bench Lite benchmark](https://www.swebench.com), achieving a state of the art result.
|
[SWE Bench Lite benchmark](https://www.swebench.com), achieving a state-of-the-art result.
|
||||||
The current top leaderboard entry is 20.33%
|
The current top leaderboard entry is 20.33%
|
||||||
|
@ -14,6 +17,8 @@ from Amazon Q Developer Agent.
|
||||||
The best result reported elsewhere online seems to be
|
The best result reported elsewhere online seems to be
|
||||||
[22.3% from AutoCodeRover](https://github.com/nus-apr/auto-code-rover).
|
[22.3% from AutoCodeRover](https://github.com/nus-apr/auto-code-rover).
|
||||||
|
|
||||||
|
## Interactive, not agentic
|
||||||
|
|
||||||
Aider achieved this result mainly through its focus on static code analysis,
|
Aider achieved this result mainly through its focus on static code analysis,
|
||||||
reliable LLM code editing
|
reliable LLM code editing
|
||||||
and pragmatic workflows for interactive pair programming with AI.
|
and pragmatic workflows for interactive pair programming with AI.
|
||||||
|
@ -33,6 +38,8 @@ When a user asks aider for a change, they see the edits performed in real-time.
|
||||||
Aider may also then offer additional
|
Aider may also then offer additional
|
||||||
help like fixing lint or test errors.
|
help like fixing lint or test errors.
|
||||||
|
|
||||||
|
## Methodology
|
||||||
|
|
||||||
For the benchmark,
|
For the benchmark,
|
||||||
aider was launched in each problem's git repository
|
aider was launched in each problem's git repository
|
||||||
with the problem statement
|
with the problem statement
|
||||||
|
@ -113,7 +120,7 @@ Some noteworthy observations:
|
||||||
| 6 | Opus | 1 | 1.3 | 100.0
|
| 6 | Opus | 1 | 1.3 | 100.0
|
||||||
|**Total**| | **79** | **100%** | **100%** |
|
|**Total**| | **79** | **100%** | **100%** |
|
||||||
|
|
||||||
If we just look at which models produced correct solutions,
|
If we break down correct solutions purely by model,
|
||||||
we can see that GPT-4o dominates.
|
we can see that GPT-4o dominates.
|
||||||
This isn't a fair comparison, because GPT-4o always took the first
|
This isn't a fair comparison, because GPT-4o always took the first
|
||||||
attempt at solving.
|
attempt at solving.
|
||||||
|
@ -145,8 +152,7 @@ to provide a compact and powerful summary of the entire code base.
|
||||||
The map is constantly
|
The map is constantly
|
||||||
tailored to show
|
tailored to show
|
||||||
repo context that is relevant to the current state of the chat conversation.
|
repo context that is relevant to the current state of the chat conversation.
|
||||||
|
This is done by performing a graph optimization on the code's call graph.
|
||||||
by performing a graph optimization on the code's call graph.
|
|
||||||
|
|
||||||
When the user asks for a change to their code, the LLM uses the repo map
|
When the user asks for a change to their code, the LLM uses the repo map
|
||||||
to decide which files to edit.
|
to decide which files to edit.
|
||||||
|
|
BIN
assets/swe_bench_lite.jpg
Normal file
BIN
assets/swe_bench_lite.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 36 KiB |
1632
assets/swe_bench_lite.svg
Normal file
1632
assets/swe_bench_lite.svg
Normal file
File diff suppressed because it is too large
Load diff
After Width: | Height: | Size: 39 KiB |
|
@ -1,6 +1,7 @@
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
from matplotlib import rc
|
|
||||||
from imgcat import imgcat
|
from imgcat import imgcat
|
||||||
|
from matplotlib import rc
|
||||||
|
|
||||||
|
|
||||||
def plot_swe_bench_lite(data_file):
|
def plot_swe_bench_lite(data_file):
|
||||||
with open(data_file, "r") as file:
|
with open(data_file, "r") as file:
|
||||||
|
@ -22,30 +23,37 @@ def plot_swe_bench_lite(data_file):
|
||||||
|
|
||||||
rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
|
rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})
|
||||||
|
|
||||||
fig, ax = plt.subplots(figsize=(10, 5))
|
fig, ax = plt.subplots(figsize=(10, 6))
|
||||||
ax.grid(axis="y", zorder=0, lw=0.2)
|
ax.grid(axis="y", zorder=0, lw=0.2)
|
||||||
for spine in ax.spines.values():
|
for spine in ax.spines.values():
|
||||||
spine.set_edgecolor("#DDDDDD")
|
spine.set_edgecolor("#DDDDDD")
|
||||||
spine.set_linewidth(0.5)
|
spine.set_linewidth(0.5)
|
||||||
|
|
||||||
colors = [
|
colors = ["#b3e6a8" if "Aider" in model else "#b3d1e6" for model in models]
|
||||||
"#b3e6a8" if "Aider" in model else "#b3d1e6" for model in models
|
|
||||||
]
|
|
||||||
bars = ax.bar(models, pass_rates, color=colors, alpha=0.75, zorder=3)
|
bars = ax.bar(models, pass_rates, color=colors, alpha=0.75, zorder=3)
|
||||||
|
|
||||||
for bar in bars:
|
for bar in bars:
|
||||||
yval = bar.get_height()
|
yval = bar.get_height()
|
||||||
ax.text(bar.get_x() + bar.get_width()/2, yval + 0.5, f'{yval}%', ha='center', va='bottom', fontsize=12, alpha=0.75)
|
ax.text(
|
||||||
|
bar.get_x() + bar.get_width() / 2,
|
||||||
|
yval + 0.5,
|
||||||
|
f"{yval}%",
|
||||||
|
ha="center",
|
||||||
|
va="bottom",
|
||||||
|
fontsize=12,
|
||||||
|
alpha=0.75,
|
||||||
|
)
|
||||||
|
|
||||||
#ax.set_xlabel("Models", fontsize=18)
|
# ax.set_xlabel("Models", fontsize=18)
|
||||||
ax.set_ylabel("Pass rate (%)", fontsize=18)
|
ax.set_ylabel("Pass rate (%)", fontsize=18)
|
||||||
ax.set_title("SWE Bench Lite pass rates", fontsize=20)
|
ax.set_title("SWE Bench Lite", fontsize=20)
|
||||||
ax.set_ylim(0, 30)
|
ax.set_ylim(0, 30)
|
||||||
plt.xticks(rotation=45, ha='right', fontsize=16)
|
plt.xticks(rotation=45, ha="right", fontsize=16)
|
||||||
plt.tight_layout(pad=3.0)
|
plt.tight_layout(pad=3.0)
|
||||||
plt.savefig("swe_bench_lite.png")
|
plt.savefig("swe_bench_lite.jpg")
|
||||||
plt.savefig("swe_bench_lite.svg")
|
plt.savefig("swe_bench_lite.svg")
|
||||||
imgcat(fig)
|
imgcat(fig)
|
||||||
|
|
||||||
|
|
||||||
# Example usage
|
# Example usage
|
||||||
plot_swe_bench_lite("benchmark/tmp.txt")
|
plot_swe_bench_lite("benchmark/tmp.txt")
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue