From 4c6fd48b27bb8df6bf0edf1070020e9fe963ffe1 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Wed, 8 May 2024 15:02:16 -0700 Subject: [PATCH] updated gpt-4-1106-preview leaderboards --- _data/edit_leaderboard.yml | 32 +++++++++++++++++--------------- _data/refactor_leaderboard.yml | 31 ++++++++++++++++--------------- 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/_data/edit_leaderboard.yml b/_data/edit_leaderboard.yml index d15a652d3..abf52403c 100644 --- a/_data/edit_leaderboard.yml +++ b/_data/edit_leaderboard.yml @@ -231,27 +231,29 @@ versions: 0.18.2-dev seconds_per_case: 33.6 total_cost: 17.4657 -- dirname: 2024-05-04-14-33-15--redo-gpt-4-1106-preview-udiff5 + +- dirname: 2024-05-08-21-16-03--may-gpt-4-1106-preview-udiff test_cases: 133 model: gpt-4-1106-preview edit_format: udiff - commit_hash: 1981105-dirty - pass_rate_1: 57.1 - pass_rate_2: 63.2 - percent_cases_well_formed: 94.0 - error_outputs: 24 - num_malformed_responses: 8 + commit_hash: 87664dc + pass_rate_1: 51.9 + pass_rate_2: 65.4 + percent_cases_well_formed: 92.5 + error_outputs: 30 + num_malformed_responses: 10 user_asks: 0 - lazy_comments: 7 - syntax_errors: 3 - indentation_errors: 5 + lazy_comments: 3 + syntax_errors: 11 + indentation_errors: 2 exhausted_context_windows: 0 - test_timeouts: 2 + test_timeouts: 1 command: aider - date: 2024-05-04 - versions: 0.31.2-dev - seconds_per_case: 15.6 - total_cost: 5.9468 + date: 2024-05-08 + versions: 0.33.1-dev + seconds_per_case: 20.4 + total_cost: 6.6061 + - dirname: 2024-05-01-02-09-20--gpt-4-turbo-examples test_cases: 133 model: gpt-4-turbo-2024-04-09 diff --git a/_data/refactor_leaderboard.yml b/_data/refactor_leaderboard.yml index 9971a8374..8e2e2a9e3 100644 --- a/_data/refactor_leaderboard.yml +++ b/_data/refactor_leaderboard.yml @@ -78,24 +78,25 @@ versions: 0.22.1-dev seconds_per_case: 70.4 total_cost: 43.3437 -- dirname: 2024-01-25-21-27-47--jan-gpt-4-1106-preview-udiff + +- dirname: 2024-05-08-21-24-16--may-refac-gpt-4-1106-preview test_cases: 89 model: gpt-4-1106-preview edit_format: udiff - commit_hash: a75e7c8 - pass_rate_1: 57.3 - percent_cases_well_formed: 31.5 - error_outputs: 127 - num_malformed_responses: 61 - user_asks: 0 - lazy_comments: 4 - syntax_errors: 1 - indentation_errors: 15 - exhausted_context_windows: 1 + commit_hash: eaa2514-dirty + pass_rate_1: 50.6 + percent_cases_well_formed: 39.3 + error_outputs: 164 + num_malformed_responses: 54 + user_asks: 1 + lazy_comments: 17 + syntax_errors: 0 + indentation_errors: 8 + exhausted_context_windows: 0 test_timeouts: 0 command: aider - date: 2024-01-25 - versions: 0.22.1-dev - seconds_per_case: 181.9 - total_cost: 18.6347 + date: 2024-05-08 + versions: 0.33.1-dev + seconds_per_case: 61.8 + total_cost: 18.3844 \ No newline at end of file