run repeats

commit 411c744a15 (parent 92a09f3deb)
2 changed files with 41 additions and 35 deletions
@@ -9,45 +9,50 @@ from aider.dump import dump # noqa: F401
 def main():
     models = [
         # "gpt-3.5-turbo-0301",
-        # "gpt-3.5-turbo-0613",
+        "gpt-3.5-turbo-0613",
         # "gpt-3.5-turbo-16k-0613",
-        "gpt-4-0314",
-        "gpt-4-0613",
+        # "gpt-4-0314",
+        # "gpt-4-0613",
     ]
     edit_formats = [
-        "diff",
-        "diff-func",
+        # "diff",
+        # "diff-func",
         "whole",
-        "whole-func",
+        # "whole-func",
     ]
 
-    for model in models:
-        for edit_format in edit_formats:
-            # dump(model, edit_format)
+    for repeat in range(1, 10, 1):
+        for model in models:
+            for edit_format in edit_formats:
+                # dump(model, edit_format)
 
-            if "-func" in edit_format and "-03" in model:
-                continue
+                if "-func" in edit_format and "-03" in model:
+                    continue
 
-            if (model, edit_format) == ("gpt-3.5-turbo-16k-0613", "whole-func"):
-                # sublist reliably hangs the API?
-                continue
+                # if (model, edit_format) == ("gpt-3.5-turbo-16k-0613", "whole-func"):
+                #     # sublist reliably hangs the API?
+                #     continue
 
-            dirname = f"rungrid-{model}-{edit_format}"
+                # dirname = f"rungrid-{model}-{edit_format}"
+                dirname = f"rungrid-{model}-{edit_format}-repeat-{repeat}"
+                run(dirname, model, edit_format)
 
-            cmd = [
-                "./benchmark/benchmark.py",
-                dirname,
-                "--model",
-                model,
-                "--edit-format",
-                edit_format,
-                "--threads",
-                "10",
-                "--cont",
-            ]
-            print(" ".join(cmd))
+
+def run(dirname, model, edit_format):
+    cmd = [
+        "./benchmark/benchmark.py",
+        dirname,
+        "--model",
+        model,
+        "--edit-format",
+        edit_format,
+        "--threads",
+        "10",
+        "--cont",
+    ]
+    print(" ".join(cmd))
 
-            subprocess.run(cmd, check=True)
+    subprocess.run(cmd, check=True)
 
 
 if __name__ == "__main__":
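For illustration only (not part of the commit): with the settings left uncommented above, the new outer loop reruns the same model/edit-format pair nine times, and the -repeat-{repeat} suffix gives each run its own results directory.

# Sketch of the directory names the new loop produces (illustration, not part of the diff).
for repeat in range(1, 10, 1):
    print(f"rungrid-gpt-3.5-turbo-0613-whole-repeat-{repeat}")
# rungrid-gpt-3.5-turbo-0613-whole-repeat-1
# ...
# rungrid-gpt-3.5-turbo-0613-whole-repeat-9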
@@ -24,8 +24,8 @@ Aider currently uses simple text based editing formats, but
 [OpenAI's new function calling
 API](https://openai.com/blog/function-calling-and-other-api-updates)
 looks like a promising way to create more structured edit formats.
-Before making such a big change, I wanted to make
-sure I had a quantitative way to assess the benefits
+Before making such a big change, I wanted
+a quantitative way to assess the benefits
 of function based editing.
 
 With this in mind, I developed a
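As context for the "more structured edit formats" mentioned in this hunk: the functions API lets the caller pass a JSON schema and have the model return structured arguments instead of free text. Below is a hypothetical sketch of such a schema; the function name and fields are illustrative, not aider's actual format.

# Hypothetical function definition for a whole-file edit format
# (illustrative only; not aider's actual schema).
write_file_function = {
    "name": "write_file",
    "description": "Return the complete, updated contents of one source file.",
    "parameters": {
        "type": "object",
        "properties": {
            "path": {"type": "string", "description": "Path of the file to write."},
            "content": {"type": "string", "description": "Full new file contents."},
        },
        "required": ["path", "content"],
    },
}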
@@ -70,6 +70,7 @@ Using more complex output formats with GPT seems to introduce two issues:
 I was expecting to start using function call based edits in aider for both GPT-3.5 and GPT-4.
 But given these benchmark results, I won't be adopting the functions API
 at this time.
+I will certainly plan to benchmark functions again with future versions of the models.
 
 More details on the benchmark, edit formats and results are discussed below.
 
@@ -84,9 +85,9 @@ their coding skills.
 
 Each exercise includes:
 
-- Instructions, provided in markdown files.
-- Stub python code in an *implementation file*, specifying the functions or classes that need to be implemented.
-- Unit tests in a separate python file.
+- [Instructions](https://github.com/exercism/python/blob/main/exercises/practice/anagram/.docs/instructions.md), provided in markdown files.
+- [Stub python code](https://github.com/exercism/python/blob/main/exercises/practice/anagram/anagram.py) in an *implementation file*, specifying the functions or classes that need to be implemented.
+- [Unit tests](https://github.com/exercism/python/blob/main/exercises/practice/anagram/anagram_test.py) in a separate python file.
 
 The goal is for GPT to read the instructions, implement the provided function/class skeletons
 and pass all the unit tests. The benchmark measures what percentage of
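A simplified sketch of how an exercise can be scored this way (not the actual benchmark.py logic): after the model edits the implementation file, run the exercise's unit tests and treat a clean pytest exit as a pass.

# Simplified scoring sketch (not the actual benchmark.py implementation):
# run an exercise's unit tests and report whether they all pass.
import subprocess

def exercise_passed(exercise_dir):
    # pytest exits with code 0 only when every test passes.
    result = subprocess.run(["pytest", exercise_dir], capture_output=True, text=True)
    return result.returncode == 0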
@ -152,7 +153,7 @@ format, resulting in the code not being saved correctly.
|
||||||
|
|
||||||
It's worth keeping in mind that changing the edit format often affects
|
It's worth keeping in mind that changing the edit format often affects
|
||||||
both aspects of GPT's performance.
|
both aspects of GPT's performance.
|
||||||
Complex edit formats often lead to poorer code *and* make it less
|
Complex edit formats often lead GPT to write worse code *and* make it less
|
||||||
successful at formatting the edits correctly.
|
successful at formatting the edits correctly.
|
||||||
|
|
||||||
|
|
||||||
|
@@ -323,7 +324,7 @@ distinct responses, usually less than 5-10 variations. This suggests
 that OpenAI may be load balancing their API across a number of
 slightly different instances of the model?
 
-For some exercises, some of these variable responses pass the unit tests while
+For certain exercises, some of these variable responses pass the unit tests while
 other variants do not. Results for exercises like this, which are
 "on the bubble",
 are therefore a bit random, depending on which variant OpenAI returns.
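One way to observe this variation (a sketch under assumptions, not necessarily how it was measured for the post) is to send the same request several times and count the distinct completions:

# Sketch: count distinct completions among repeated responses to the same request.
# Assumes `responses` is a list of completion strings collected by the caller.
import hashlib

def count_distinct(responses):
    # Hash each response and count the unique digests.
    digests = {hashlib.sha256(r.encode()).hexdigest() for r in responses}
    return len(digests)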