From 0b01b7caf5a8f110e46faff1f68c1bde438bb932 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Wed, 22 May 2024 15:26:16 -0700 Subject: [PATCH] copy --- _posts/2024-05-22-swe-bench-lite.md | 14 +- assets/swe_bench_lite.jpg | Bin 0 -> 37018 bytes assets/swe_bench_lite.svg | 1632 +++++++++++++++++++++++++++ benchmark/swe_bench_lite.py | 28 +- 4 files changed, 1660 insertions(+), 14 deletions(-) create mode 100644 assets/swe_bench_lite.jpg create mode 100644 assets/swe_bench_lite.svg diff --git a/_posts/2024-05-22-swe-bench-lite.md b/_posts/2024-05-22-swe-bench-lite.md index 4021415f4..caa72c11f 100644 --- a/_posts/2024-05-22-swe-bench-lite.md +++ b/_posts/2024-05-22-swe-bench-lite.md @@ -1,12 +1,15 @@ --- title: Aider scores SOTA 26.3% on SWE Bench Lite excerpt: Aider scored 26.3% on SWE Bench Lite, achieving a state of the art result. +highlight_image: /assets/swe_bench_lite.jpg draft: true --- +[![SWE Bench Lite results](/assets/swe_bench_lite.svg)](https://aider.chat/assets/swe_bench_lite.svg) + # Aider scores SOTA 26.3% on SWE Bench Lite -[Aider scored 26.3%]() +Aider scored 26.3% on the [SWE Bench Lite benchmark](https://www.swebench.com), achieving a state of the art result. The current top leaderboard entry is 20.33% @@ -14,6 +17,8 @@ from Amazon Q Developer Agent. The best result reported elsewhere online seems to be [22.3% from AutoCodeRover](https://github.com/nus-apr/auto-code-rover). +## Interactive, not agentic + Aider achieved this result mainly through its focus on static code analysis, reliable LLM code editing and pragmatic workflows for interactive pair programming with AI. @@ -33,6 +38,8 @@ When a user asks aider for a change, they see the edits performed in real-time. Aider may also then offer additional help like fixing lint or test errors. +## Methodology + For the benchmark, aider was launched in each problem's git repository with the problem statement @@ -113,7 +120,7 @@ Some noteworthy observations: | 6 | Opus | 1 | 1.3 | 100.0 |**Total**| | **79** | **100%** | **100%** | -If we just look at which models produced correct solutions, +If we breakdown correct solutions purely by model, we can see that GPT-4o dominates. This isn't a fair comparison, because GPT-4o always took the first attempt at solving. @@ -145,8 +152,7 @@ to provide a compact and powerful summary of the entire code base. The map is constantly tailored to show repo context that is relevant to the current state of the chat conversation. - -by performing a graph optimization on the code's call graph. +This is done by performing a graph optimization on the code's call graph. When the user asks for a change to their code, the LLM uses the repo map to decide which files to edit. diff --git a/assets/swe_bench_lite.jpg b/assets/swe_bench_lite.jpg new file mode 100644 index 0000000000000000000000000000000000000000..32e2abf4c3295ce521fdd9fd1f3f6dc7ae5ccc52 GIT binary patch literal 37018 zcmeFa1zc5I*FL&w2|*O3LrDpd?mm==beBj-ch@GQV+#l8wT>3upfLPBcx2Xv1Z zSy)&|s5p5!n0OvCvoL-65g24-WHeMX+?zLXneGwZWBQN(Ts8nOZ@{X+t-!(D2f$*& zz+u8%wg5-~02l;lZNF6b`yUusICum^q#MX6sL%&WZvkLo;NW24;Sdnu;h|5vLGJ_L zF%fRxV-!TZBdddS-wKP#Gwj_BGNHmoY`Oj&a^~mOUdSkSad7eQDJZF^AJDL{vaxe; zatZ$?A}S^>@mO9#QAt@vRZUk<-{6Izk+F@foxOvjle70rA74NJfWYvG*O5`tZ{Efv zr=+H(XJlq&7ZsP3mX%jjRy8%Zw6?W(eCYf%FgP?kGWvOJc5Z%QacOyFb!~TV|KRZG z7<_X2MJ^Zs+&8hVCHsS1m{7T3;o;%nk-o?U1M2`i;4tA4?lB_X7L-NOvAT1g$@2!5 zP}sY|Mr1N(xgG51*8M1V$ysJ8cE5=BRkFV)nAd+vvTMP9$TbE)gM)!~9vmisA8>q1 zo$P_~&+$(i{697Z3V6XOb2r7Lxe&e6rhzXsOv;l#7gRUm(hyjjiCH$D9;^X)Q3CfR z>tDKyq(-bj=yu8RtFjEYDC-~QUjjCR(Y#EkTJZgXSs6T;jg{MFKJf@Xn_V(g&6`qG zYP5(Z#4-&VpcNo)7O@VQ8wAZ-?yBnd9;sBdU$8uguRk8aZg#yrd^q5iwUR(qUtL}0 zqVD#1DBr+{pfgO=Qe^?AlH;%&;6j1DcnKJDjw(D8yaX7Gui;p&I`hR5FO6}&eI6xB zXf9(Zy{%kuOpDqa^Dtvx8xK6porvog(?-Ot;FuKHC8H-I*V~3}iP-PKC_p?e;&i}| znjYx5a0z&M+-6beZRREuX-fQ3H_iDb082T{v4==Z9Xy`qXwPtKol7%#Oe}6aLyLve za&JHM?F`y{DRb-8tr1tE+3Ma~D^t9?SOcTfZ<;eFopw8tf`c(ZA4whqj!W#T7?C>6?vYY`i)na|kaw@pUNCY-u6MHqBWDf1dJTz5~0qCz2uS1M7>F+;pLJNU&zE` z3WjX$u)E_Wp{ED#W!9(@2knAdQ`Co3%lkJ?^FCThbK{16);$Xtu==pV8y5g|HCIjXfEW!Al(us+dHL}-}5{M5}Q2Up3Csuuwxs@=Mgb3rx{nF zwin^^X`tcf0&Gxc&L+^lAxl7(Q_++8aQ~71Gv`}kvT`yoB3S?X4-JPat zW)-G(kDAFng3UL#&>!!J30Pwgy3rQqv$wk=C$@BF7&}XW2M#xBgel^iA1-jV$+o|1 zINWaR23XVYEYIQi&NIwf5KGw+>OL^a%opXaLNzBzhG6$Q612#ji|XQ7rDk zNLy}*M`w-hsNI|bj+`X`;X4XRF?AZgs}FVDqECwq%;rjvpg|^iVD6m8Vw2aL2c};YcghPi{EnVoUkLKRkD%^^KPRx!!zOpE9=Y%(j05(Js6H;Z4}(X3BBp zI+OVFJWe};H<>320LzKPq?;GaO$qEJL-hF!b@zEUbFFD5O>O;C_7#YgElq<@M{Kkl z3(ucTuuIL?ZqBeVfizbv8Bx4}V2CpHc~oRy$_ok0UU4{6{mej_vZO= zNp7E0?<5V_ZqwX+4B?j8xF6l6gc;8BY7#aa2kAX*som~eh(l93J~CMNW(@{%D58|y z^aWAjUXe%2yT&!dSlOllRifXAcw;w0CMLer|jT2y}(vOG$N< zf6|}k#la#8rm`rhHg}xI<+%4$++SV@AiBqv+0s?{xr=Q!*q1c;#cuWymvXDw9dS@|8M7hI-9E7eb>j#Y zuit_#E1!zaa9N5Gt-#68B8KI!@S(>_jnJhQ8y4uXy{hSYAg=NN&N|{;!0`i;|R(WV(&H{;Rg^quB5ToG~3L>kUuYG+d6zk3hM30AtyWPp)9y3gf z0`rZffv8Qs&)iRTJBqMOnQeR#o^rvWGscY8?B0wFO^v!EoQW5giE%fPCk*Z2W1tUG zoix#|;(;0BR+n7G5UZw?9qNgWq+J+&3o_wji3ZK?4|Eco%W#0_W$Bjyd8)IF6B(GL zf|N_ZG}1ZszH))ZvAU5!!2Gk)$sTzBJor!Z{4*&2nF@cQ zDLAe9c0ROIiydSPP@H*REJB`pko-H(;YsYplfG>|dM8qpqP^QkhRMk3b`q-#~s`kZbXd#|jqC2B!b8_DXPFP4`8 zE(^@4tVU?a9Ivu$T}hYWo*2g@ecYW(KS|j>IPyOD5@iDYdDQjzJIeJ@oz&#zwUQ=|ixl=bCpv2K35-j6^SE%thxXH2b$bkE z2I>-a)aFXVcRko6$%+t>;KSLxB+(A!*9C~^vCw8-6gX1@k^}$u9qE$CNM~K=t$c48 zCP=m!_OSr}7E{xkUemh-*j@r?+?Xs5nSfQmNC00TbrS4P2Z~zh!Z5zb@T9apT9zt%ZcYW_$sRmM zZ6CIhcP4$xyzO)x(t8P`nLk(*V#(`sZVZ2ad#;{ zVC(S+&Z4bGBAuDB!|sz6AoTMo*;LR5`>?(&3m==?*peUaiAC0C7?L$IHw@lGZUcDu-rv9T0+)8pFfcmeFRD&^}_fHWm z5v80C+e1n>Z%;MLjhLFIlnjz4=oN)%J~XSTPlAt+y zZEM}(=`(tO^$cxvEzyK#deV|iGh>&vmi_cC|0I9swHASQAlcnyIP*JE=GR1MKX;LY z%t91uEK+`-1P!evtJN|a^UgOezff-(@a2(&C_lw$4R(XQ|36e zE*~=`$>9lq@&Ny_sxH5S5BV~^R*DF~>hx|bkUfY%Em2tW?u~;>K;a3osp$ZtWg{p` zeC(aLM%?+br#il$-^Mlru!JI3M)ei!U3wwMIc{eIR!hHWvV@boz#mEhz>^BuUKe|v zIC)`oAOrWiD|j4C(z4g{U<{yfV?)00JcJk8X_in;vb>&9x6)L$@&x741H)Xj-x^Y2 zqXGe6uae+@aWObRrd0M^Y&UU%X8TJ?#`)2g0OD@Q2}CG=M4#V>$Pz5m`+&dtw11K6 z7^vokt&g@BaaNy4EC+Nr%;*>R`D*BnZlSgs_@D1nI{G1 z1g~e%9>EjOPkFHaXay5ym0_xZEz^yS+?$(=pRN7%_WBhC=su2etZ@@AWs1#J>Xq2x z>)w>IaBn^mel)&hHl4^!;$ZO9DuFQ&uu^l~xF+|bkvuRi}G4{awD*D%wzgYGETzzS}|DU#mNn17yTs(B0mwXbvg>f8x5(sencaHVd z;?DQ>e_BBop3CNm%w8ax35ka*DJg^yd1cv?qDAzfr=FsWLT!UlM{lu#xvE_RNTq)6 zIxR*tcFk>sKvEUgOzxYbUBs5O(#_G%=I`>--?_~Z(2J=sGBqOE*2oX3FG zm0WF96CC`bm(jYdE{A<74gmM(&%|(TFR>A_NVMs><-UC7y}#0G#Ja4r;A6 z)p!}-`Cw+0%%IT~oVH7V-(EK~sAg8AH^V&k1PS7;nI4&IRJHh~`qOsaHhnL#-fcv{ zDWNT2n+L-eYeb&%JI*4%8Uf6TxEMIkggKBE0boH7F)QfW^yv~%0gSXc?9%|Ym(*{O zENWcXj7;JxO+o=K6^1(~ zS%%s3R<-OZ(7B@|7|5t5X3J^%5uZA_DpV%<7rsc|CO3rV{yFe%F85O{V9PL3yHMzd7{^Mp?qNC(Sr3ONsz&+_6+`dr><_ltym!PiTtk?!Bp$rw#LI zdn(EpTA68187r~-TTJ~ZgE?KnMoRES0;1FXFw|RkNTjze0iJ|&RoT>V(Vtkv37SVR z@>at>j^M~^m*y5CKhb~lUWT*FtYN-RM6Ry1woH<9>;7OAY9f_HEu%@ zjbq*(c_wDlFbM^pVH{XUS6@mVq|H9%XOA-Iju@ICcZ= zGf#qt41$WQa`uFFJPHS$Z%H`=R`_|_glJ~IHr`}nyy(OsuZe}wkhz6^ePMkldENWh zVh{YdiByGYA3EV7WpI#=Fq>GIrE42iB}AmI`F!yANo%0wSGYfh%&jJ&=9y5Z4rMZ^kiPN%Fn^4I8DWLXS(>s5j zi$3Y6%mVnsF|VUXx(FG4JV}+~Do!FNU^97}780{hha)Y=O4Bo*H9fLZRox5@`;hSh z!||ge9^*N`CAGzCfZz&kY59I#H_jN_$vD#6^7IWGP@4z&uLkmeC-3h$y!Wa-^G!26 zp_bfYNSHwo!92OH^#W$tpdNnHCs*XzZeq~{dN$gi5$@F5nv3&2_AG}F=kIBgB30+? zB+$$|g(?7%d=)&(j2ESOVBq2|LiBiUG~~In?QfC9MU7nyl~T>04Po=E=Pv;_jg-r| z^p46Mn))!x2qf<&(8`Dc9R(WZ%US6`FI^d3?DXffF)jf~HL;X;RzZtmHpDh%zmd?w z^)0-Zz+W~c-KBhUI#GA4>=WxGtwweDDM5P;@{tocfWykWR*(>2GdiL_lZZ}&?8q(E zHwNV0)V;Srgvxx^)J5N)co3#uJt4-V2FF4ykU9TM@;0uaIag-#S?;P-QiaJ+T$AZNg6^4ro& zz;ec_=*T4?Au$aKLKu8Y5okl4-CqNuuiZ%Xaug^rB6=ADH$L{>(#_`NVFzqN@9!Rtc{@S*l` z`yOkE+G+OtQjI!ytMSocK8!pI>DfnFyWh(xefQ?ymwfdE5k>jAb2DTP>aj)TjMH93 zktBkVER3T~E&;vLpDMMMU=EHXdO*xb~7*w7p6IhI#0mlygCzP9W&t@{HkL17jyz{RfOk2HnM8`|4!eNaTx zQ-A<9AlYXA7D4lrK^|x0OShP*cnHq433uo+4`E>s!dZ zf`e{`u-#Wp`%MU#e=iO5H^D=`^$67D|K2zM=YD`myMlj8d+B=*q*uL%V+gBsY_CW{ zHv9$dMB73(Y}YN{aGDXiup82rn5ibo2zspPHANIG)Oa5=$^c;f)Llnwz;T3DpIbH6 zTJy$eQ+T1mi}p@j4{^MrjCvz>l*t4$t`Xm=~2mrg` z1Y(f#KyHoTvjIb{?!^+Do}(nQ7y9tIg^5v5?gWQb_XV!9^t)HN3M-fx)ZpD_eXm1c zZ7EY`SrW9>V+lL`*-6T=S>_@Y z(-bj{V8UCKgtz(eKPP+87@caxr?&JP4c{%ZscGg$1tnu+Vt*P{WfU91iw6E=Zyl`E08)5t?3v31bXN=gjeSh zU}8omZrRa$6&-?3bT0w0kjN}m=l1hLs9_0woay>o?<~_wa^MQ* zuYM1C@IL#_7u&Z3;cJo*pN0opQpC0pyRQnY>+*m*~5oO}*f zFZ>bI2x@$K)4KcU?eiUqx|rS8%pfuvDUM0pU?I;&*{(Zb!I;7L+snS(u}-9K9Ykob z?0T4PG-3-aH;*Y!`sBSn@=Pj`DFfRp`@?vaHcoKht_$EoFzz&GI(iMCr^W|pYDIG{?o0?4a zT)55mA&$FD@PnnporX2HDtSAGn@0Gh9Tm03wv;mt*sx45h||OIUS5tCeSl(5t-dyd$xaL3{|$` zw!> zjJ96pR);s&IO4>Daets>D5t72}5^Q#FmN@Jb>!%6x|OLPcI!;stb>p@woT z=Q$(!*^O)820OQ_7EO>=`-^8!Djd+bHNv}N!DM`yfPWnygCY8XfD8WYNcL*m>Zsd7 zy_aXtuUwd)&nBzD-HqvOC_uU?_mj=Ck+GAKPlAH-!$L8>_9Tsn`WA~HL5KMor_e8E z8|?MGeQ)w(Hgm<|%TJVk(IksE)gipf7rW;nx=KBH(8-en0YgL!bDV+5KGOcsiPQJ- zYWDnmUfHalA0|1*IZ-IZ{$X8wwIyg7R4xHg{5uS*YLU^oz%CLn&Ox?c%M-|*hfqc1 z9ryFkT%~DK%Akry1vsI-sIh2kKfSw-aTau*LsO0aX(B3+I#(*t;n-U3#056~bIpl8d`)|Y^zl1sn@Xpeth z>^h}Dof6LL{qIZV3Zzc141JaS3-|wzd4F@t%)2@ytsR$hG5GWFz3R?cDV88t8ekhr z;KVTvNNDkRBxSESUv=yYp_%}X3)ciGzYimINkM25^xsr`h4z?n-y9dt+lv}DDmn+e z$AIIQB2&0SG{NpXTBZ{6vGI8sqo_lyofFIA-9hf4eqUk2}=o za}XPEQ9JQaXZ)SHemVYGx_%tq)@U*)=`laHK}~~;!HcDvZ(0GQ5Q=4S&@sW(DOt!t zngBy?w<$nG^aWM>txC)j>GwV`Fk4ZRX-HcPv~ZGL1iAnl)1+Y3m^< zK1A@E_1c(M=Yr0Xg0uM({I|| z>f}N;rt3vFS}o>k(e29jcctQiK&wg9@hj244q5c~zbf#Ld4F@r?$-zK8x0gQ2T|^xr^VAnV38-{8 zUAVgd;~h>7*RxTF+nhBGt`-Z@9!{?Nv?N``J$2iM+@A{xcR9Y{mDs9Q)ge7YaHg<1=EHN^KJ2PEe6X1)U+#tNVpX{LQGKVexJUA!-P9%q zkTOgEcuDi|QnTzJYZikN_FJ< za3UGKJ~ze1^AvLfc#e(Fj?t*^s$S&`w8Boiql_8%l=hB@av7T zq?5u|W;z3z$Jo|?So)YZ40{XHpzx{e_msySzP8{qd^KvAf0a>gd)ykVYeHO$RGG_l zJ&(U4t5jcw27M_w>p%%{$v*p}R6EkIhYO^ua&p94ngeXlz68|qhvgbiB{u2E&wVeD zR+#Z$mH7KQaiJP9nqYKHTiPFGxqfN-^sA})%@+1`O5EUIRE?>A@!>?pZ7XF}ZRq!_ zDeX5YX_J=!mb_P`o4hNFxcmM*lep73F zk#7@q)vErWAMSk@IM?$(kX-xck!4M!`>#hH^YAUj5=?k>-POb4y#6TJ@i_xrK&f; zK{3b*L;-M08@NjxhCiu3Iy$|UlV!|Q*2Ice*5n->VZQMU&Vx(OQ`VS^XhN;Hlb-Tz zxH$fZV)z0uxfxfM)HJ>Y8eo9AS)cG4W>QvTI5E#?HM6EXnpkslRUoX0?SAlbr>Lop zYLPm;cXWH2ZOQKlg7V+lhKc_=gl(yPE4P$`wEzvAC_O+qbVoG7hW9mL$0`lNL`tO` zb7;+oiv~Z)PE*4eLb%ftx=zJDkR!qc0V$-N2OrR|vpAKNmU!n}V3>zH zD=6=&VN&tH{gW$d~^xIT1>e1y$7f+AraQ0el zqWeww*ug&8D`i^bgSL6>f@J~t>NukV8WodK{4D@nvhgyLbg)VOG}yb*{+KH+i2i`7 z%Z#Pt5d~wV=IAl*xs3pk-9P5*cTetrh54Fi<%V72Nu&|Zyyq!JyFs~GX%P7CA`j`9 z{M_%0oA)g-un?_d(_ZOb>?)Xg2{2z#c7Bn#;Py`FB$nN*5h-2N*%|6omL@?K5?@^c z@cFM(oD`#+^Al1Il8~{!0^45h4`)wA`I*f(E4PD#A$HGI%&pAJj*;-zk$W<9k#gVAa}EdiM&O6)dbyxrxm zksSn*%v{ftUOovXj?-oBQ^eRbg{Zk?Td|q~wsv z6;LTvJeZKcLt|2`6$8@Gk^>*0JqW*OgzQ}c7^SZqgFg{bVZUg$e@rQO7 zN>XGS5y+qHCMK;C);YZh}So&RsVl>l2KD7d$?{;+8Ak_$*gZeOCr!UVum+OBoHqUFWNPr?e-8qG(lC zJywNMf1k=pGIK)l4FVk&`h2(>0LkVGPtt&d8dRj45FQS?Rgc}`-!^H~bYcoNjtguC@ew+wm> zr!Dyt<YQY_7^`0qe*lPTrK~EKYg`>7fNJBo}22gU#;1t ztXH^nwB{}FQ>SxiJr!SF3Hx&b&`zKIfXLL-?XE8qd2m`TYYE zk%j&e1&bPo@V&bPxB=sB?Rucd_#ppw!G3;dVNPW!y%!V(3laKzR2ko`H39<8tt|~P zc+8R_&Xw#Ga)?k(x^w{FgQ1cSb}rg}Ag5U`REwBdo)6E7)VnyT(k)?#52qZrArjjG zzzuA>OF(i=rS=V1u^oMo7Yi+I!7Wu09xl5Yxt5ki714K|^CK)U{c4}V>T`%b=_X98 zY|&-CxF*1ohTQ&8AGh~{@V0YRhV$aj#gj$E`w$dmJ(6qz_So~&C_KPfDV*7au#7de z#kA51K8Lqh!43}%?1^e`@P|0pMQ4w$JDaNc#@Y~d$9>*g3O>*CQkdWUg1A~&gqSqr zhi16Vr3YXlEart3hfSC{=@^k95qefXqap%#+aJgvF{Y>+J3y!sH3tud2ZU7f;_hxU zJ>J}pcA!88yf7qO_Ik>tcp=<;mHK0=bENvxXEVyee@1vI5pc0elo0Pfll6pBYrbB| z!2aUG*eEPVoskR-B?&ovDXCQX=NWlLf`0LpEnD^u9~JKgapFhC&Oabj4PVK&Yn=a< zf%^nya$sIq7msY@B&PWfkQMM}=B;OCe0E!oaVo|Pg5+F1Tz3G7o+-1#q zp>L9*mX;S;!Zy4b|Ed}HH;uSBn1+Q2@ZiC>?+{$AVt?K>^iNde|K_Mj@HHN+Kb(Gr z2N$g;lOXrqC6I(>2U%rFdgMvLj>zS*xBvj>-$Up(p`kpGT zW%~O0HIn0zlMykMp^34mMySCj`tAxGaQl~v>*@!wMr;Gw!bKHC5Z=C-0dR#I4Ta66 z;GExAYe?AMuq=(sKIT)6_<(R@y2RQlZhp*zV!%~6*@US^O{*|tT0E%o)9@k%4X?PI zCC&LY{5|i=X=RfEPN&DTGqf||WSK4rki+fI>#U*)ZUa!y2H<@-!JsU+Lk)3aaINO$ zej3%_S!ZOCzzKlRkp_6STfBKO=D|jt?!JEk8U{@P3s3vtR(`&M{lOqMpKwYvh~{>O z?vL{BE&awPMirsr8?@O}!43MBfD+2SX8h%u0RNh2ta=LFG*!&TcCkOQLbJWiKYDRx zH@GPd0_U5wn@<80j}tfCOVwre`I;&1=R*@^ztN7l9&jF|BpR=ALdkTA#+*lAICTH| z{KIHKcgt~*hW;Kg+k8U3{5Si)^FdzC*)B|MXX{deb)pL~zX zI=7s6W&3%jpg}27fHTh9KE{}bK{(Qc* zaSo~4x7LRe7{B(yU!K8rz0lJ_P_%RF__ z-JbR_84q61%0KT_{m0M!%8B~N&;0}O@WJ;D99aif@r~;}(PR}+C1eDg;P0$FU3qv7 zlT@y61=E^;@vG5ESgT7Y8it;^3Cx5Mz1TdgBF5q7sS6iGj2ClXM{n$qx`tuIgq@M=Q zazWLjyeep!?-`;0{OS9RtOwWoeX@+Md0h7LVA&B2AB5|rxb%ALl!UHWe}2OvD`H0o zwVwX`6}KPgnF{MQrWP+1@?cBt3!-Q33pK&6_*yT1UTJIjUp8g5K+PpXgKL}Vr%f~0 zU7W2J_=Ah7QDBp0ihb3)d5^D_(Qkfu^k15H*w>treCr=n%t*};BXj6lphVer%~UB` zmsj~t4O6x2$9TD%_@$1jFu#x;2|A3F?+s)B@JsSy+MttS6*xiiacBVSpSAOavHEL9 zD6-kLjj6HtN6#7hR>$Dq#h|bB!uxaGtcg|PTZ7xVqx6H8VW?rRLR+X!9U4g?*p&*U zArGh3`wLjQ@|sXHW2mStyTS%ZWCz`-;o?8(Bns_j8+K)CYpH1(71+9ub3f^#v%VjW zY7K{s)X72M2wTQ*c*ZTcy|GF zAP%6l^x4edYZ(Nm)M@y@!Vlr87_U#2q(7VUF9$&sYejy`$};~gDXr4`NF8(n-6ar7 zq7NPP2@EGnhuFkzeaD$sl z`~oH^+4#P%V|=!vt@oJN60OZ;V2(coithlCk^G!xWA zG7$j*vp9Vl38~3RMZQVIbCYr$spM?#Tip2FXGFc{JHXB=e44>ArF)I+HVNNzK!p>~ z-62TlIq#AaOP(JY{;Hu;#vc82BQp|jE45TM*1uoR(UTxj;y~;A*(e_X*(NB-~K{3F4VkJ5o{jpc#l6;q6Sy+U)->+DV_dmTM;z<740-wcN zUjc3}@o~gA;aMrL-@YS>gGR6}c8qr{VZFZ{)rCS@=ScC+UlovR> z*^?#|c4>fUWT)N^DPhBSH`eL?QdvrY<>CR$*`(Ye`3p0l1&s=_Elez{8Fx%?@YkVl z@O_GiU03Dn$msoEfxnmiy^js|{f^0@Si?;T5d*_(wfJ%He$?Eo7Dt>%#cJqM=}LNh ze9O?#FDH4uVaHFS`fqo{l39QLJd-+z6|O}8l@}Z>w$35i4ZgXw??bAj#O(o#%LDq9 zuvF#BgoP;F#wg`ipoU5kpZ{vlH+l`vJ|wgE*~vwopDL@l();ocIL#HFrxgCVIV67j zJ{HnTVjE;Y$9{@@rL2E8Q-5K<{f^M_<-QV0AdkGxtl2}!oD8uFYN^_!FeUXUTmqRp zy>!gM^DNtUz)|DP_-d2gXd}$sG>s-EfR7QK({?zAoEr#E+x`#b+p6M-Z|^$zk@p`; z=$hm)?M*E{R3o(0Ek_W8Bi*qY@i2rN)^P~GF(v-dyc6{v=a^MT$9kZfl@|K0Xc}b- zyN|0BQsf*eM@=J(v1`t#AmE8#IXNsZ@D{h)Uw;cHxe{8jR7ba5Gx}W0K|4($VWPCx zPq6CxqYQMLGKnM(laL&PTJ2-dqRRZU9I_^*uw+J4JB!}ikJlr;N3|x%qa?mp?Zgh*Rk$B4rXYLaP;D?_%fOV5hy3V)rSiZQtD)G&`D1BvM5*2TVKEFT1@R$j5 z^wZK^@((OgcbPQqs$xsr_$YvszzlMpF77>})hZiZKfeRktK zoyP|*of>Y`FrT0?Z%ztTF}i8&qx{<4M=g4VLk%-+H()uXkJ}s%VXI^Q)O@s2yCfl- zw!A{TG(An*%ERQ+k5pA3F8v1CFPudhu8P^O_ho9N%$nP^DL*qcxqXpi&wiqL9~v&0 zu>$uYOk&K5ltLKIFR=84=-rTE&Glg`_d%5M57qEi|hf!x7*p;HKHQJ`k;#+ID zA-yzVz6Ow6)<|_G89VO9Ste>MHpXfq!|_aFxReX|24*QX0OW?=0v4FQ{$F@mKM&u} zcu{Qnnj0|pY=RRX&z=+Q-si_!mtV17wNYJ*GF&=*64>9;d1jyUg8sxx&SdJrP=Q@! zpX*)QuGoW+#p=q+Xj`R5b@Qe>@ry-_<0R1}*tuF1l`f8SGdt~|k_@9r9ht%jb53%g z8}5Zw4B*I^HhRame2!l8g_J{ccN%L|HXgoAlN0BJ494v0hr|3odkoiJMBA=bvE3|m zU-9HxSrqKyX7oqH;AH++Zc85yG~yic)PwxZ=ocy~BJX)oFp!5oQWSwB4|fL8b{|?` zf>T;Z#%vs2#6H4PO_PIILa z!QwB*@OBWN!azSdw4PExT=+4R$7tt>uo#m#FKH*%e`Xfw)4r}1KYMl|EZ;v@GkDN8 zU`|2j-noaQr*U5Gza@&!ErS^UGlc)&N-N+*M~%6qmODgas*nFxof@T#lFci@g91pX zCibNmO4Wz{Vy>PW~Hu{D>p$=SDc|SF;V*3`U7@k z$y%WnqXvSm_w>8XnEiO+R_(EoF`SUx-1M*mrI-=r0k#O<%vS&v^cu_&=qEgPS zaSoWOf(8YocuXWhf{or9w& zS=EU(wY*vW7loyOYM7w_9vYB5gaxMBYAM!8q*9Qtd2t(0b9A0#GiEnv{Stt^otP`{k*6xbt8B-Z4Y)eO7LHivl+q!g%t0FFL*ukPkc7wC1w>{24_9e05ngnArE9yw1jl%m!hDq*e?&79Ao$O4cEi7VS{o#O$d9dJ}s!_d7;zk?x?;$*d4 zcID30fj1@@{6uOe3&lXPDsM{G@o+nz?VhW41MI4_14?+!mt4#kl!|L&rZ0>Ish`)( z^Vs(~H|&aeEDRoI)`%n-gHqOaGlea)>Np3^iWsaw3$*v8kqzDN??7S;z$lVt%(5j< zMkmITWE#+)zd=tr;QT=3wEfvpTYk{a422+GT{2p#9<9F{ZXWu%tD;Llkuzvd9Elal ztMtNpZCD$B-TrtuDxX4%uM-Z7iK7i)$jO!1{s5vl4ddx4^32h@t&V7?Cag8rI80rV zbfWNCbz~2_nT$9wWDq8MEI!;gY&FF0R$Vw5lYsg%^$mo=)d;1WcKTBJS#3P_AgaM> zwVk&22|~u)-mzpg?}bAH!o|vv(p=!>+Ib$ZJS-BOVJeFA!m#>l7$=`!c{eJlYOyD1 zHV@`7cAQ1I+Mq(^zMlh4Y-|o@50r35g5@d_@wCQ78)Mah>th)^O4XZk&q=OkO=xtx*<`JbDBC!awM3AQy*xFglzgMAJJ$CeiJAb897FaHP@5q|(K6ur z+m6!fT@7#a>XbYctui%mZ{eiWUFMpqGaLV1KJO%6#2&dz+N06-Ez z0EpRHmPJKqR29?7miWl%y+X7^%6sl8+@_`+Vrw_Z@&P0DJs*3^R4}dEYc6b^NHTck z-l%>I6vI>Vtfa@fyR4-Lr}C?rKEA1ioei)l?@+y~JosKFS2Fl%QAD>);i|`uXQmC# zhed5+`L~&kQ(;E(>~J6Fm;DLMLJ@tIt?~+hM?PQ>L+ePMV04!wMXqPd5(`~|>U{xA z6*vvPDh7{byl|GF!!<0Yw!}Z@0fmF!jxK3F2H}zijuJSGw=KXayM30jZYMA*(B;FH zZg4NL{ax|cC^-o@UK4ilYJRlelr7tN>fi@I8z|ZGnZG6T44Kag-reFqprgI9)K^MN zPuA9_J#9F~gxH@lu!(QtG7v>e7TKeL2FUO{O@w|vCu0qBwJJR&dtAbdvLN$=dz1~^ zk|!>;qJ?OamzaeyV4!p-D%xnSvPR61!fS1b}J(b9q}UyeFSjNGI@fxr&h#_1*Qf#w%P0G2={qa;GDWqjqfA@i>X2p@tTzY z5ygLNmHhrmZJR01oiWGq#<9@Q(c4nQI=bgf3^c$;{9G%BNf-&aRBj}KpBP-Rv$Ry# zk3Ypa@o<8;DBq$}aH~L81>vK5g~!VBkpj%m_04SVdOr-9n|(EcX&}WfiHz`_ z`iHs#)gjV~&eS~R*fnTTsd$^)+_gys%oIgzBq=iMHp^-rn8 zhkgU+5~=3`MLydlZ*8{=r`wUX$;(9V%YfkzIy$oN#?)o(s>?()#{^UTRx~g~pjvW! z&LxW(lE>{|0blOwxvd_4&Rw~4RB4v&c-zH{QSUzO8Lh{*1s!l2HEvt0nUqAfM}T>& zG@hZoj&r94f{S50rqONXkV!`ufJO26MC1*uYADBM@9diG=)CyIjlh(C(FcQna+tnb zr7*DtCSyR~gMd=rrxr9*e2)-SM-LQ+Q`m11sEv{{81W7kvV z@HNRwbUfK0*+wU@c8wY?VsQIhdX6(?fI{4Mn#9quv^#(LP`Qw4!vN%fM218>S&0>e z&kOxu5;|OIjb=mCnp~xptP}EWh7iH8Eeuct8z7V(LJWzwFpF9RuZc{_de?#k z4=C+iJUi`o`ZOnfwLEcrY${`RM`@{7s?M4Wjy!)xzyK~7uAQhAU=DU@Vl?6-jn??I zl~yuvhzlRFssnUbCM25Yx1Hw%*u<=_z`?<#$qp8{207g;uPgz{gfk!MRk$L$x>n+S zz-xaDD98q<2=;B=FCStJqUB6mBumg2CFAJ9uyEdcTUxzI)joFgR@cJZuJ@LP=&5_2 zW#_Zdfi6NX_gy*k|Es<43~Q>})(s*=DTdygbb*K_Z{zG zd1eHE1A$+NHkxV(VVoiL8Z;nT3_V&=dqVV`^ik}ehYw=-UhmV9Y;@YE&kVo*Bv|;U zf$pVpR-TPFdK!kx4%)bqbwTASwb4`Z7tfigl@f)$Io>%&Ccv5W>Ldv z{Dl^akmgY$DP6A`+7`J`>dSi|Hs~k%p_J16yh{w%qTQ7~@hAjj?6*r2X?2PBYG?W! z!ps@p&shv`yN8xhgz?yytxSbN@P9&El*uG}QW3dWQFN?B<A+eywSD!)To7+j41;9lFyrcb&%(b0(QLCD(>P>b8$dv&eT$7&5=O`GFT*P_qOF z+ZS%ZH83D5msEzH1-|5gi4#Zdo3hsgY^ymbV8S>3>aSbJU|@qP~ZhkL%_&uw3I(O4-g?K%E8N{)nOCH50-N*b8n+^jKz}t*6Og(U0>IE3xH_hPeIcIVcqAhJ+@MjWd8MqIXJX<{Wh6X0Wl5o( z43oz=>QUtWIlfLg(D<@(m>4A~PFW~Ud96)iYJp)=ZkO3CvbeP&i77%-_?|hxEDz%V>)@sh!!fM}jsDU;2Ubd+ zR`P+XhrO_rc278-&Yr_|X44Xn3};>@!)|9HN8P9Nk&y-O6h!P>)6X|Eba|g&r24iV z*HN6d6wmOX$Oh&T@T=qGE4TNq)=o-MsC|VahTLyYj=F&|2V=orz3g1Nn3Ed$Djx%} z)>?hteehidsOUHR=iqqOZF#J-IKq&P@qUUSQ8Z&7c3*Vl*fbnFqX$LGH8)fNt-40)2R&J{e;PJW;yioT zXN=~=^7_eo@f*d+D7+~&d7Eo5%7hW&@96gP>YE$$E&A}UC%`<)sAGiW-MUdZT(IK( zCWAl&@5?-{BcQFoB!AJu+PbFkO7KpVJtKjDt-TYv?o!&#pHT77MJ)e1EcD;)o0{JY z&YrS7YY{mdA>&zlzTHDCeqs#<7JLfCi`i@kzD1QEFJ-`-KE!gX7_iaYjO$J?8Vi*m zpzt{J1GT?MM1}jqr|{;1mh4Qr64yi(tqwdjdfMByyef6`e*+$)BFrXK9ZPk2cLyi@ zp1>^nW*+kbKLZb5VvXQRRoh+EsI*>$$BGIV4%RhA-QLRZZ_-2T3sJ5CK+BYQs+oby z6{wrN`lXxSp*g+#tXYx<6EMX`$frp-H`34YzE!HolLO&gq%d1KXuV@sI8lgJpxd#eu!?}U9zw#=EJ&VGJ3`qvCtqy(B#k=RYj*f{lWNG>Q+?IjAplZ9->*UI9|6unb zMTu+bRwmi9gW;ckd3~b(9&cmZF(Ib``k*$MomFNaP1r|F?CGr-5){eRlJ@+Mshh!K z=B)=(huf#Bii&&}+dJ4QT%}@#f8c?5LHdD5wsXu$KzErIG(J9*9-CIV2zODZpBSy0 zHcp8iE>VeX_T4gG4(rPRMR768Nq*b&bjS(3fbrvW={}Pxy0_-W9I!wdB&x z3w@wn9%foZ#T_o&FU8(M?c0s5>fBZ>&-1%W27wt9l!tWPJ1YTZJ#m1qG|=^+2g}L1 z^08S(FI!WqO7%>}h7@SqoNzZ~mQ1e{hf0vVume@_#ZARj!{TjUx7oUsLu*;z*36*p z7)REAlw=Q@hqxH-?xPmK99&~HEU{=P^ezLh`G^d|Xwm zu(6=*^CJ-l>&O?osk8}~IVqLZUxc#v@kxvZNniBbW*F;)4-Du4`{T7C7k#?gH*-;eGNs<#_xIrNVIN9-J|EMksVUN!x(pMEiqg?QA zz8T=TIJSjr(OHDdoY%H50yFL7ToxkSy`{OGAr9Fk)Me#cn7~EG`>!};-F`rxMYl9l za;~VAQX*joJ8*F}b?F0i0aa&pITv?rj>MY!rZ`XH=?fX_s*b`NZTGH*yE{-0r7HEL z2~D@`PnB)JdB@HaksioGe6-VbG1rJ#iW|Rf`M8^Lv$-9Tk%+T`a&s+(mAjtRNZx-m7P4S4`i;o{@+j z>Q-dHMzq-{=>X26K(vNPBKtWLOy$+mi{(#*MXmRuPfb)Cfwkne>kgJgh4hi%?o3N$ zD2>j81f7w)wP&WXU$ptALkNy`Unln4YhL=o;_!-cVmb|VD!I##<#LSP{|afs>`c4D zB-VGXd;I)Wu>ULYVSSaTbmVt-{6d#%om0Zx0K%@gUs9WIhCpqpkYE`J?7C39M2tKb zN@pF4&f`?)gMLOhWcWI!BG#$zT=7~BJN$YwOs)dT`z43D-Jp2E;#z#qDR`y@)?_8W z+Y@)u_E>vZtYu|$Gi9xlV-)Fa7%+9z!_H`fQhfBKPIHX-WJb)}|Bn|a!USK#rYQx> zqY*9PI&C_S_Q6gaGeDayUCzT$;F8Io)N)ySQPF)6OjCvak`awT<1T8#=Z0*mcu2UB zzmXkbdmPpeiEAhznQT88v5g9BdtBFUat%SGOrwn>)u?1HqR<290;mge?PDk6G1I3H znat6keRdwq!_2sLF2He{N{4jsei6#KX2GEUbe?(*jbd4K)x7%Qem!Z~{3B;#8gV{g2YK4IRghmP;}P5LMb}A> z5rOKQ9&`M8g`;{w6<*h#7l(OY_uB>;={$Hu6b=Kbm-F@z0bTf|4u<|&IR2#NW<)O3T$qZSud=2 zT4U0_tc{JC+*3%vTs1RSlJZD0s&XVn_bM!2+21`;lTce=pAa)pGuKayzLS5uUN3z} zc}qR#<*CD3*+KT&ieBe9QXWKGUjiq6eCPjMmU zYa;BjRo)T^Gxu~nV_P0a8i@(hHSucPxZ-Lv+JL0G*Q$+eEA9QNG9@FIASjpmylXYx zlU9N+opmR`upuQi(GgMnl1=7T86dW<4X^_2;@yJbR1mj@UZ>;GZN^)5-|^5~r*omb@xD=#LQYY;1x>4~WF!@nTAkQgvwB;&a8sHm!`J z+=3hnQ0Ty2SCd8}Uji{@(tEYu0;=@tM+O^ExCS146S@E%RaMCgg2ZcfW}LW*HODP* zr`gLTd>(x5`2|H2a$h2ota`g&?;mv24_S1n_%`x-qv*hmM2z6|B4Uo)xieDPp}VtL#yZ5 zmn9aoXp0R)knX7WFufJoA#7Gi>DmW}Pd#81SStMR$tvb)ZT%pGundfqC(Qd$O=6-^ zZc_C>bRIu4<6d>x_@cj}1}HrXJUH#a-EAiFV-Q!zPhgku=5d$0qxP=i`US6C5@7EC z4Jf@p3j%IK(lBmkhD9WtX)?U|xqFk`&#EU+DcZXvbFI!AU?0v|gZ;=fvX{}l0jrCc zp|WNd1>;*u0_heNZs5 z)GDUWB#X(bA&it1+3T15wBb){e45oDYa*rgf!X9su6M12Yll^IB0Ul@sYv%M*$|qoiWpK_q16J)LmnZo z=+Y{ce>wrU|EnulX^YcSMw8u#_SKp568T$}*ojOZB$x8r8f*y`47c84HB~PtDQSdz zs&l*_3o=+@KKR+^Yx89CxU@$fL~|BQcxB-;oGonv4h5}>gssE0@OR`mdvg5d+df7#e|hGLB1FW&q@DsK@=9 zaP}?r#;g??;@d+H+q+PIdocMXn^m_gm!1$WN4N`~WebkLU6;+P#p>wjcr3I(Jzf8Z z79jUh{X%ha^S$rg^(b*+ThJ4Uwq-L@GotDCUN5q$ZQ>In-Y@gXx87P{Cwf!CPls09 zNy1c`Cb*5aFkr`U zSNVNRpClh&C@CJ`!g1?tadNQQ28LbV#Q2i*Y=Up1LO|~%{IC}Tm5Kg z@op=rW0_Wi8G75pPmY{Hh$}Ms@ifQ7H%j6SYzs@;NtbSIh=)4s`lL8DuH?CEp>sJ$#JhK zuGD_*M2Nq;FbX!M07U)pQf#*M1r7_DUR=*Kgk00$b-6G=&n z9XIgK@^IQk118667TXXBK=ywUKqRw>y3w`=>$8Z*^=}_c)>u>(g?B0r#8v7qnfi%_ zDrBDzO0U!gmOeVXBP&dW{q|rYd^_q;vbo(p@9g13)$uRCFOZ~PjXmSWhy$zgoNR^w z>}QDmyY;EY#omXcZ!j_;WT{M5tz!1ILPl2B@~1y+4>$u>!~}?gp63eWKGSw-IqLgI*f{@fQrg!F?(JXdl5b(9 z-#LnJA1Zx*2Q2+?@d?-F*B6p7)b7_=aM*;=$y+obMjvlZ(L27Rmn-W@F8TpH7APH z^u4ucmtV3dYN>7UcKxN5=d{!mZXxranSN*n3)ue=5GSi#Xl80$o~OA1_y{l^_s-JL zX>>&>3~}pm>Adczp|bwsd7sMaPw2KJWZ6kR8hjVqYhv1L7U(bBl)&DUPy_6o;vAMA z-i9l*8M;`5dLfeA0V%DR);=7a!jFy0b<(SIMhkiuX=djegJ1WVKXDdg6fFf4 zd-+eIlx*+&W1sm`hu%E2X*Fpf9IB zJ!$L+J*<*nuA_O8pRC?Ta+dWD&4jkhk% zQENnV_IFCAk9G>xM9I5y(^DE|Jf%dJeg|NsfI@aFYYFcJC^mCS%xWxcwjkia;&igM ziKF%BJd^FuM|43k39`yqk6~cOs`qaIrtzF_vK~jK)GQi29(}B)+Yk$`P^mwM{ydTIpi7J`05noJdvEw^4Pbxb$YNwJsI4+BE@Xj}sS+?L|<9 z&izpeApc$FrN=B$<VOM_5TRu`pbEWHT+OY4NCFjG+OLnHO!^ewz(F=%R_fpFn~NXm!)nn2Os_ zbZGcan4X?lW!`f{pBn`g01H|&ac)JBRU~@tN(lW(SDM{8aqNYthW}X?gq&CKKWc=$ zq&BIf0KNbPK*cE&4|F4ZK(qr@&O?tO%I!Cub7iDnt$3C3hT8WpyLwqqllu+*CbLUG z=7L#!5a2(-xgx9n*mizU^l|e8cj5X8w?B?{+((Y+3i5=SzdjAU!DbVAI#@edxu`c* zfXrUJ*AiKnnEWpBHz54g4kY9{zd11^k{gx=Ya9dD@nC}J?Vlw6p#@@E$xb7O*bEe?=>tD3tzrGGnj zr8!z;g`$#miC_4xjT}FjS_xL2(RW?Hlr7ZW>?0P#M{#PN_dt{H@i}l>)ilY44>B$C zKGNJ9>ibqPg|(R{)$d$TEx2{6tEcTPZlAIMKBwA~>9 z2plKJz92sBUhnT9tMFE&duA3vpncw4(&*aM)Pxzss|2Ew#2*_cVxz;!4uQO+p{0|7 z^(6Fi=`sPzU*$?X9|1sUf?15I)Xd~B%e^feD|`Xg#P-bZvAmfiey~;U|FPC=z>jI_ zBRNKO)lSDJIMrY|p2TRUa5w-I$}mIv=5nY1Hb=oK9VzB zV3YN$ll5;asGFM`nXwOt8k3(SuD3QlmV7gg=KIK2hA7<*SSYA5dg=!SA4#XthM)uh z)mVD`akS*Q1Dtu0|AG%Mjdw1L$?EG#2U2r)g%KqpMH}+RLcDKVL`zHKB z$VJJDIh6e}&yBf{WlVDOL3e69JuQvj@e5s3_1!_0$|C~r5Yc(G!iOmH{zn`P;DbXP zk0b8s#)kj)u$poO^!4d<9}J<=3u$^;BUKo;rR+mHpfkhBoMiXR^J6QWH<{)Yturmp zU14(>)zxcEdVyWI^?3TpYq33rOy)DV`?E{H?B9R|^@#iphW0Ms1;Hz}}hUNV)P-)Y@f%bT++0leidJ4!%or6nz^Spfef2B{TL! z-Kxeyw!L-Z)_b^TJF+}{<;q_vyU)BPHlQMD5#E8p(ngkSc)%_HBzE_fKeC-hybUz2 zIHk7gWDB!3;u>XyQr!zPllKgrM*d!U*}u;Hopi(hfOh`1{l~D^zijjW%{IW_v;PM8 CZi=k{ literal 0 HcmV?d00001 diff --git a/assets/swe_bench_lite.svg b/assets/swe_bench_lite.svg new file mode 100644 index 000000000..4da3fc0fd --- /dev/null +++ b/assets/swe_bench_lite.svg @@ -0,0 +1,1632 @@ + + + + + + + + 2024-05-22T15:20:34.149598 + image/svg+xml + + + Matplotlib v3.9.0, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmark/swe_bench_lite.py b/benchmark/swe_bench_lite.py index 3a99428ae..cdbbf8227 100644 --- a/benchmark/swe_bench_lite.py +++ b/benchmark/swe_bench_lite.py @@ -1,6 +1,7 @@ import matplotlib.pyplot as plt -from matplotlib import rc from imgcat import imgcat +from matplotlib import rc + def plot_swe_bench_lite(data_file): with open(data_file, "r") as file: @@ -22,30 +23,37 @@ def plot_swe_bench_lite(data_file): rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10}) - fig, ax = plt.subplots(figsize=(10, 5)) + fig, ax = plt.subplots(figsize=(10, 6)) ax.grid(axis="y", zorder=0, lw=0.2) for spine in ax.spines.values(): spine.set_edgecolor("#DDDDDD") spine.set_linewidth(0.5) - colors = [ - "#b3e6a8" if "Aider" in model else "#b3d1e6" for model in models - ] + colors = ["#b3e6a8" if "Aider" in model else "#b3d1e6" for model in models] bars = ax.bar(models, pass_rates, color=colors, alpha=0.75, zorder=3) for bar in bars: yval = bar.get_height() - ax.text(bar.get_x() + bar.get_width()/2, yval + 0.5, f'{yval}%', ha='center', va='bottom', fontsize=12, alpha=0.75) + ax.text( + bar.get_x() + bar.get_width() / 2, + yval + 0.5, + f"{yval}%", + ha="center", + va="bottom", + fontsize=12, + alpha=0.75, + ) - #ax.set_xlabel("Models", fontsize=18) + # ax.set_xlabel("Models", fontsize=18) ax.set_ylabel("Pass rate (%)", fontsize=18) - ax.set_title("SWE Bench Lite pass rates", fontsize=20) + ax.set_title("SWE Bench Lite", fontsize=20) ax.set_ylim(0, 30) - plt.xticks(rotation=45, ha='right', fontsize=16) + plt.xticks(rotation=45, ha="right", fontsize=16) plt.tight_layout(pad=3.0) - plt.savefig("swe_bench_lite.png") + plt.savefig("swe_bench_lite.jpg") plt.savefig("swe_bench_lite.svg") imgcat(fig) + # Example usage plot_swe_bench_lite("benchmark/tmp.txt")