From 7623b8e2e6d013c6d034deab4c0ab84d2039fc67 Mon Sep 17 00:00:00 2001 From: Paul Gauthier Date: Sat, 1 Jun 2024 07:03:58 -0700 Subject: [PATCH] copy --- _posts/2024-05-31-both-swe-bench.md | 4 +- assets/swe_bench.jpg | Bin 50697 -> 49765 bytes assets/swe_bench.svg | 314 +++++++++++----------------- benchmark/swe_bench_lite.py | 39 ++-- 4 files changed, 148 insertions(+), 209 deletions(-) diff --git a/_posts/2024-05-31-both-swe-bench.md b/_posts/2024-05-31-both-swe-bench.md index 3cddc6e59..095ddedaa 100644 --- a/_posts/2024-05-31-both-swe-bench.md +++ b/_posts/2024-05-31-both-swe-bench.md @@ -23,7 +23,7 @@ that was reported recently. [![SWE Bench results](/assets/swe_bench.svg)](https://aider.chat/assets/swe_bench.svg) Aider was benchmarked on 570 of the 2294 SWE Bench problems. -These are the same +These were the same [randomly selected 570 problems](https://github.com/CognitionAI/devin-swebench-results/tree/main/output_diffs) that [Devin used in their evaluation](https://www.cognition.ai/post/swe-bench-technical-report). Please see the [references](#references) @@ -251,7 +251,7 @@ In these cases aider with Opus was unable to produce any solutions. ## Computing the benchmark score -Benchmarking produced one proposed solution for each of +The benchmark harness produced one proposed solution for each of the 570 SWE Bench problems. A separate evaluation script was used to diff --git a/assets/swe_bench.jpg b/assets/swe_bench.jpg index 1796f2720fd944a33d9327cd26e1cfa900d96fab..4ce2881c3e638986917ec8731339aad1048af65f 100644 GIT binary patch delta 4855 zcmVyj05F_1F!_GlVBzge^fVDP}#s{xP6g=7dR&*kN^O3NhAsYnu^J~@g|#R zdo84ma??eRF=4cpK)D$iDx`EZEylm2*~%ii)hzAh+8fK7Wkvw!=9 z05JMyfHi2Wn0!m4++Pbgf3H=gXECYR!5PO@4~?Yz16MEgD?Kh0n)Y>x9z}@>1ny=y z00GSaTB4y#s9x$@Re#^as?LNEKy2Xqjt@^tnt%!_1iiPC`fDYVHxQ!8>4Um08voQcLlt#e@8T`k2~y$AL48i zf%pSWw7#~!6B}tah-GMG`AEaB1am+ZD58o0D58o0D58o0D5xzjZSSIaEu`LwBgDr9 zVU(3&&!GmVpaP00paP0)k|a^Ik}y<2S(x+ybu|Tr{kErOmv)Y1f=*G&GUR&TfN}3Y z7HF(_yj7yx&m>Y_e}H3ck~odbXx@zAZ3pUV^jejq*Ya4$Fl&T~W@N$pjhqmA5y9y| z7O1K=dd{DJ9o&|;D2~pY0Hd;t|@)<)bGXu&Qt~ z>p&Y+RV_bOwABQL)X)38Zu4#Ai>`5v>NC@|SH1BbnWx^He>;h0`$C|Qq*l_)g*m|` z6pqG#HE61ucBf~nTN{fR#Ic4b+!ZRrs372j>}uTr6j4P06jggG>uo~c&$W}xi+d*R zqLuRTDSU^mT8FW zStI#>^df%>->mIhPJ7?7PXj95T23a2w*c@}l;g2dN921@1`PJMupN6=)Ajh%Y|Se| zAx1#a#-OBx^U+5_F`o5nQq>-B4{7s31Zk-1qA1}Gp}e+r$z$xJC-AHHei_m4uHNe6 z$zDkaGGW*(e}r`0pzUwO8oM^1Ydzhq)RAtQ*LRZHBWB@)$3w>)&<8yYg}#^K-5Sa@ z#k^X4;u76jUCFrTyAhI4xvE!DESh(Y3LHR^86#o*GNcH^k8r?#Rn_V`RhFv~M|lsJ zBW=jRiZUNi%0VB2s&|?UFQSbi$s@P4k-(f}6;#KvbPGCBn%vZL5>?LZu~=KXv%tm|Le*W%{R_J1J+=gU=D zNZMDUe-5Lbg8*X{v?H5Y*0qx^sdr|!4zWWV66{p@19^-(Hc9Ws1#LEmr#_o=r?tLq zZe8AJ_qWW9*j(>~TPzHC{B-HLNwXJ#)VPe(|YAwv4IE0c9H6^cx^xHd#=Y(B}BRO67vpD|%zFEjWh@cKO?kz?i2oq| zw@@Xm)G{%WZ{Cf#O`s|2Ny+GGz9`h8y^`j8$ga_?ZsLa0BWo_uqEz1|?Yl{AU~|xV zu@&0e_-{zHzZbCCD$cCTzcUp=dX-Xkf2ls83cGt_ZK&J*p6Wj`E$aJs5z4S&c=rN; zF>h`ayiwuXs5b$19Pzg0DoSh)G7lpKM+c@&Kg4rR_j*p79n7||UtBUvG)=X_sxxg3 z*s%a-tzKPr>8@)z&t9H}sHhP*wiL1qJHMHJn z)UIQ5xmPObha?`v4mlOlTX=U#v|ELW&2e)Wj(zOU<|xiLZYMu@^VjkeS@?HKuz4Qd z)Q6HY6FbW8!9ThS5iGC1e2tt(4+ zbK!kQ#I37c!+8tN@j|lVDEzlzv6Sd@v}Zi?U3d09fU~l+@;`B8$oA{KzCwcw&T?_@ znyoK}G-d1f3qqxsm@fK z9lf*DpIX1BOQ_oG+H`9ku&`VQTaWE4h}=FDkjMj)4o4wIFl(lL9>PocuU_0V^F`#! zOC7Q&qb_!z#8g%u9@6y7kF;9a?UXLXXOY~Z`Yz&jk70@c$?;X1T$F1o7?AS`^P!R&$R$@x0l!2mcM;&Vb&T8 zSfTRQO_?JBoi`R@SQFO(ayT{8cy8wI^TXD2-;i#Kks$?6>=HBbf08)>9zAJqygjAr zG9;Ha#hp{j+>$yHzuhHB+&zwKLI@(Zf;b?JV~RBjh@$~O=m4M(KY-uG;RrS3t6#m+ zUH}oNCNhTO%PLp+xIIrlm1`U8h0^?GBV9@jypo7m$a4@7DI0w`Ipe)=e_`q0*+WeG zKl&JO;yvy6x9`q#f1kb2T8i_-dQJVr^IY1OX}}W!E2_zXfLTERkUA1FCi2NZX?J%dQKYzuO324+Cq~?TbJSGx-|9N1v40$pYawk`C5<%c zk}(@JVT(TJQpYENaxg2R{{VzfL;FFP*n;L$GXo>-cP+bfE^x;uxHzfx4-aX2rN^Il zYr0aq8Ind#f9vSmNZdV+XakwCmr>SrZ5|uj>xeh$Z#BYOO|(kOBB(i2j&q!kr%K)N zMcu`{jifg7wYj{!irMF1y&AFPjPyKUA9~S`OtXgWE2yAUmKY7ZWk~^J$U8vk)kkjC zCGC~AqiH3r+$%H^3^Fj_&<7!@+sS8ld!w!W?DmuCe{$R~mf@M=WFP_`?_sh*8OI-5 z-tep|Z{bZXRpel8V|>Wv&ngZEdwLGlHKu{4Sw(Ri7Q_pL*(*Gb?#qm0a}sgM?^kWD z;reT3Na;@cYaA9Fcn_& zR<_dDe?;*ev?dE#^vNxRZTvV-G9>rqSP!jqviMU)nrS6_FZHZiL|I*9a(a@0?K$@q zZu>{m^(#niF6~lFcwJsN##AX@n8rGKbf69f?%rPvY0~NIkv^@aM-b0ey514-?nI#f z0KjW%OFP?N7~0xtGh0LCXzLzsB4--}m;eCFf6q$kuXL*|Mte!F;1Wq`Cz}HjM#lBv z^Z3*nhllj-PVH{)VwcZY{IKy{NfFzeTWVIP2P=z3}Fl zf2dxY%ZR4?Lm-U2S2D$w8NeYJ4(5P3=fA(wV%99lXVm4?pcfZ5xEWc88v|e~1zo(8 z$pb?pQr_SDCV7|b_bBqmDvlWlc5HL_5HVeBJ|EJc)26UXDE`ec-72vx zT>6$LB=9-Tc&0(&y(Zq?>Im&gTe!?qf3wQ&?t~GyaV3D{^fUp33f<|K{vDtDM*8Ki zm6ql?r7VtolHi5tq!HWk%`T3U-RXK>yFR;d4b9ZZ&W|Qxea(*~N1OxL9=RFjx^~ko z9qiem%UK-ToyX+)O8WD`Bb?P(d^e;)elKFPRhT)Ees(IXdaz9G2cDo%2N`b-f5n%C zE_IkS8^oFm=?qaUxQH*>2q33M&QD)j>%3b$w%!ueCW2RrRJUTU8x>_fppKPj=z4L| z~)NTwIk78&8 znT2E2wVw*kZ?C3hYna)ja}(w~@X8J|{3QPX5FFGudWGPDHH%A)Qbc>?e^t4XWlMKM zF5Qa`;z-9CARaTC?7!h3(!^F#t)NA+efIuaGL${}2b}lLbKa{zgftlhs@4Fkj99Ol z_)<=Cqc}sJ>xuxyn%?VB)I3n_s_GD>qc7Rxf^vZy$yU!DMtgMY$27Wh_kJV0vx4gS z>fSZdt?r83N@8hbkQ}iif1C_->CJT!X?Ae^uVf^BuO5C~j{GQI=Z*jzRayKsq+G#k z43>vwtU?${O6tmh0k?1@b94Ht# zJd=}49QxJXnFX<$a?#$(h7Yz!71Z#mT}{u2f3&Nsdw3+YRZC^t z50uRrjCum35IwU=Cx>q{1CEC{!LG9Z07BDsHjVCWvhl9Oc;sl>cGZVRh03N&$0O^jE6n-1hVziRk zZ9jJ`i7bVre|BeNAgL@)h_d&-M zRvSy-JHse-tw{d>X~5rWK6&}E5OKHrrz060p;@1mF!qU^H`}{W4Qc)uG8P1IOjA0$tCRZe`&Y6d*A8~$*Y@4Jf;tEDr{HS z9f-$M^Ao@oN_+iA(@ODWx1W0fw6}st1lK-nu@~K+oDzEuGHbJ+!`fZUGes?~R%?&7 zITA2FZUEc}Ia7cK7^r{Y6VWzKg2xMS50f?}apo>_^K;41@pbp04tK-@QPDLGDYWIE zQNH^%f8%bRO~*KhmdBvS3Ngr=kAzgcINcqIaPc7SjTFbFc-fJQdp zcdpJ$D|v5Vym<3G3h5i<{GqYCj)yx(=XA+?paNQ_GtZ|@c-B=9-Tc%`t=G|e*P+}c|dos-LH d8b)?y$T%g5&q3CJG^n%i!xsXR|HBvp|Jj+tf=>Vd delta 5854 zcmV<479r{7gae6;1F!_Ge^qTmP_@#mCB40u%D0FD@3gi#$3O}1Ko)4Kb85CW`dHND zxF2IGt|d@d5ZD0m&v92M0HUEca0}J5GOyYiGboR_P(Ua1BBr1MiYggyp@!lJBz#Lb zA~nG!urhvt4(Hp{Pz2OgMZbx(%d3@w2Dx@=3}J<&yEg-a^X=P=fA`KQ@_3g*ibjr2 zYfnc)Rh6z#5Jyl(XaiD;%e?U(kzisI-Y@TC^HOV0+7AT^M04SoMy}h^7?IgOolgolU7HkuYoOB%bpb6-rvuE)Jmu+bs z%(q{}sLawZeK)^iE29*_?r)&CVt20AqbG|^k_B?EaulImpGCju? zVr@#!4MyKjl670VfXx(GB(PDEeZLw2wM9d3bq(4(K>LC_e@5I8Rg~a@4NX7=6je{{ zJ3sAD?B6t-zc0)&gky#ECm0DX;m6`^5X~in*`u~0Ml(Rt*^ZQ0JF)z$d#d!`5JE^tO2P8f6OMNdEl6j4A06xVf)8Qp@ajFNf*RrLKr z*Hp7hyEzPyKizVSx%U|Zk9q*LMPtw6ofg`8A(r$56Ks*zH)KQHGl8@psjIQ-R&!sq zjB^IKSXO^(sa7391OgA%u22C* z6i@+0e^;}hDxQD}D58K0Dl4m)Y^^4`xP6er8%ZNzovH>& zz~`W+MJi7UOBvjbp_Jnp2a!MzqKYU0qKbm%_TJ5)S>)JfltUi`?s5qOj+_J5fG47g z8z=yxpqlpH-s;-gSvJjbSz=rhxTxHD?ZEF-lQF49lkay4e_Z#D27DcU5B)R$0MUzG z4OhOgw$!ZTw7ZYYf=q5*#AgH%$mggO0l@f&J|EI`9S!)gxhpoD$D;o6OT+km;p|VP zZKP9cnx?eV>X(*jV=#gk?dCCr0TsN)1Mg(9>&69dYI;qMvud{5g}ToYxRF65VTVEp zAk};C4rw}-f74vuT5gq;BeKZJ<^j+!QUedJXakVa?kB$1bj@brr)#63$#BTBva7PX zf%1clH(Yei4{9{~{aLN_tw~o`Y3^l-r+bOw7P2Fexm7pu!TE4Az|D5@X%;eGTHPCP zvsx)gSs1R>Kf{1PBZ0x-pTuhFs#UGR zj9>`7p^w8m{{ULVu-|XtKMLF?M2V;(Mthi~e|KDcHZVVcuCr0lG|g6Jn)26tfp%_J zWGuhkZKQ4<$0CyFOZ!FsoN*diS!P6uc+8mEoMQo07W{GPKn?L7*V%MgNVu8@kJUaWhq*}Z?!?j~fuG0F#*Hg56 ze_O~zu)Z>+k;pj&5y(02DujL((V}wcW<-q}#Li`M&a5y)9Obz6&O2g&Fl`d@!~PL_ z)2d%wNqos2%1tq7+%!zwFF-*9kti%_)9~$nptJA1j@zTM2-}<_qKz8-r#0{ zCGj=Pwtg4YE+8aJ9jt{|bq8tZ@atLEZ>MO{=*=Chw^#ANV6eA`s{*UI`T>A4$E9>P z7V+I%OK&pnic~1;>ZD+b#*f1m*A|v*sfU{8(lZ2e!*L{J$DT8^jz|lidH~E8e^BXG zQ)=EIP;O_xOQ`Lwa@dqE*wIJvx9OgHW}(zHtDQ>IR@JqJi%S#Qq!C8Vzsf~@;L1k? zmLOns6}tWu(j>Q7B#PzHWfLyz$%`w1Kq0r}a8G)g+r!#zs?4br(j?KC+2Fd7f1=aM^;Pg?FY?F&uPWO*#Dwn*v}yE{8=9TXNMf$TV|Gg#YCd2M$r zd#ASD9r?if6DJbKUuMS~c;X{>gBhB;fG-V05TuUD*jak{8Qfef8yTH?blFR zjXkD#;FO?t5&$!guWoyEtzA1$duz-4+lE%SyObZ3qQ;{Lzm`e&V^&R#vD8MP<^KSs z2#g{3AGk<3$4;P72LUFft6kaO>T#y7w^7e1h0sIPFtj%PyCz z_?K6Kr~6b|bTXuo~n`s(U?ggNu2EUE(#Fg<Q5BR%BB9 z#!BY|FHk#k>T6Y?pF`GkWP?+@yBD%FQ^#>JkT?iidV}|HKWZ6{qT#af&?jqDQI-rUIwLoUP&pd7Ftdp$YG#djKpnP;j+ z9Cwi!7AXX%L$Z}&oDful+|xg?>(5TYwngp2+xWqr=27-$e<+tfx*kHGm(!kj?^}P^ zdUy6w(>~AshFrOidwuQu^PK1JbJn7~@aB_$aWwZfrQUT5C=$A?fD8@F2nS521ve>80aSC>`2n&#s9E#z01_iY(= zOcd#y0CU$p>$SJitgUV%xP~3F$8NT&+ljz<<(B)W#9s_OTVOK+^~ zwu%smf2PclglCL7X57HB$-oCAis;+IdK=t4?P3eOre_96+U{F%EV;vSdyXo7L&N%B zsXw20Y+7Br3nYw%m(jM7xO*JX2Qg(XrL1Z?Bp0{W5P6z>vRo~+!9=XGCI!#(l9fBM#hdR>$^P+UcBsU#4XBHBm^86fQg zr|^#5s!Pi|Z9>vZTewnaBp7sY!JrOHP_~lB{(T`e3+e4{blIkLmf@M=WGjXb_kcMd zjN^*h@PrEogfx(^BW|>b&zR+~N&|tOgV1ypV$VR+EE09Mwjj=%BzKYBnSD1gCmi(6 ze_ge*S;4nt42)S%JieGclwQqkoyy2E!H#Q0`n%e(4W ze9KjCbz5CFEd#{1@O_@nO9^d^OmnmZ^CL|6

{I>1Ob*i#+nm_INV3=@BJ$M<=Od z1S!wBt9RNqovB#IWp!$gZ4)abLn;)ne@tT?JtzZ!xr%QK>5ptOC)KpE8TD(e_&<;D zzwz4Idn?O77g$+oQ(HvPX#QU7h?&%?$fKZT=cjt<@3gBeN?Tbj;1a#8qG(AZY;n-= zdHNcIP|!5(PVFwPqLj{w<*UVUBu8#@kCBFXpb7N!)KWJZjP|m=enW2yMgaHAe>Q8L zxJ^m?dw2btJ*+MLoeuMH3rGm_Bo z$oYu2LiADy9X}k>=?gvXlc(#~8tuis&CKe}8%dZ5FOMWg)2GXxx#qfdf70yOSpyD! z#6h?DkH~hF_2+^|IjYh4Z%B(!`*K;RoJeIKI~5LkuuSa-+zikM85}njeiYQTm^B;7 z=eNI^qKPHj5CU=tW$4-Gt#&>tnp=MgYO_Hj#VT7nBcl~%IO+)LQ?`evHkme@ZS!e6 zv&kOz`I(z>InH|X)~w!Ie?fm|HKodVkW7UnOp%a!jE-mnmV(yfSn(#MZ!W2S4ZJp% z%(1~aPo0&%Y>s<$=zHfCLfY$9)wGQgLve9(4Xd@yy`|WK;mB>GPNO4&F_FpdU91{? zt+m|OP{p=skrGDSPE|9sV4QK2RA0jSGuqnQn?k8-3c^xnRzOZjf7}>0J+nX@#A_a< zt9V*Ndw(mW*D*Pj%u)i1$7=;R+&<|)#0NDM{-Job&#c+pYLX}0RZ`{VSnb(Np*Ac# zi6a>qAQOz&WB&jM{*~le!nT1D&HTtVJ1Iu)at=;&&U*K%@_0u=m(AZ`3d+Tb`H!_& zoRXtBQ=aRCKp68|e_ZN1*NMaGx`Z)Wd45E2PEaEl6^Y~Dw@$osP19~}{6}-A+g;yW z!5GqEyJ2l9iKdas1&JKsW2dRErX4QM2fBFTA+CaniFP~os6o{6@_;z1vv_MrxVE*o zdqc9)KPnQEy0WG*w{Relz~Gty<|nkU1kY)ZBP`O1ZVSw|g~M?S>)p znWHg|;4+XvJ^AL6PY!7kY4KQ_B^L0$_`)+QD|cYP?#apLJkSMi3|#q^_jc_y>q&1u z){6TESaP6+e=G^yImkKhT$ZzBu*Kt<5?RY`gaGq1FU*m*LUF<1dJYE_-|6~go|gkd zX9$TIi?_^JvmWGw|#z#%3c2s)|3IL{{RyyllQYT@U6k*^*QAAJZ7@>yNygU(I6a$P@}H_pT}df=P`&p5_teKO8XQY}s32J$G(NiI%8 zlaevgfH;}_MSXXs-|H4q%p)<{T1K&)gm=S8z!SG{^A&P?ik>@-M#|>mDAQJk+Di%D zhNE(h>mrP+p$=QQ2e9J0t9=hlwbNzOqO*)ze@x8ML~_B49G5*yD+Xl6XRbxWODYRj5*~lZ;Ii=II zf14dCBwIK!jx`wC`s&$M$^uf_QyV$mCJsk99-TSn zvLn_tJGivHK1S3oV`jP!Et9HCaIs0TPU2Ss2PX$7k9zDcyg8+6H*IxwY+B)wP{^{n zHqo36sT^{1>))WK&!Fj-HZWPeqKGa3e*n~o$wMC)`B_LK@H7F($#<(80cLJ*tw7c7 z;D$+MMxA4PWnXC^;FFEn$i`}IPV)D|mm1BzrS;6nwh>%L*HRfwF48n*fjxld9XPJ0 zJp)UI)*EQ;0>upC#xX&PQ$mvOE4aSvw;ig?i@?mDO;|*A}ZGyNx5Ca~wLUC3w#mB=R|+4qD24>n(m;Had(j$)?LUmwR?5e|_r80|E)# z)bXFgil#;E9v1N4pe4H0ukJ2wT#2r>tXP|j|B109ibBxA*t! zZFaGc9^JX!9BRjE5w~bPfAgNS$@N`VM}t-JQ&5Hlvl2)Br7}w-QWpqVVVIS}XPz>( z*h!%2lAGT$(O6!UY1Fa9vh-|c0QBTw@kwc*X?8JOM-8AWaJyzQvq)rIV+64zj!zT; zfi9Rd-xc_D4NCIo%xUtsGkJuO5&4FCb>t2?71-8ApN2Gh3z%+gD6I<34t%$1U5Ww- oz{xydo;p^It*lmX2w~r_5`K9Z&nG9AB=?{RsIx%89|8jZ*--t5q5uE@ diff --git a/assets/swe_bench.svg b/assets/swe_bench.svg index 8abdd70a8..ffd8dbe1d 100644 --- a/assets/swe_bench.svg +++ b/assets/swe_bench.svg @@ -6,7 +6,7 @@ - 2024-05-31T11:41:49.017547 + 2024-06-01T07:02:59.687095 image/svg+xml @@ -41,12 +41,12 @@ z - - + @@ -527,7 +527,7 @@ z - + @@ -707,7 +707,7 @@ z - + @@ -813,7 +813,7 @@ z - + @@ -1017,7 +1017,7 @@ z - + @@ -1120,7 +1120,7 @@ z - + @@ -1154,7 +1154,7 @@ z - + @@ -1247,16 +1247,16 @@ z +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - - + @@ -1281,11 +1281,11 @@ z +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1301,11 +1301,11 @@ L 690 274.534192 +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1321,11 +1321,11 @@ L 690 242.032134 +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1341,11 +1341,11 @@ L 690 209.530076 +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1375,11 +1375,11 @@ z +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1396,11 +1396,11 @@ L 690 144.52596 +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1417,11 +1417,11 @@ L 690 112.023902 +" clip-path="url(#p22faac38c8)" style="fill: none; stroke: #b0b0b0; stroke-width: 0.2; stroke-linecap: square"/> - + @@ -1568,7 +1568,7 @@ L 170.425134 307.03625 L 170.425134 170.527606 L 104.863636 170.527606 z -" clip-path="url(#p1ec2c53f8e)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p22faac38c8)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p22faac38c8)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p22faac38c8)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p22faac38c8)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p22faac38c8)" style="fill: #b3d1e6; opacity: 0.3"/> +" clip-path="url(#p22faac38c8)" style="fill: #17965a; opacity: 0.9"/> +" clip-path="url(#p22faac38c8)" style="fill: #17965a; opacity: 0.9"/> @@ -2006,60 +2006,9 @@ z - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + diff --git a/benchmark/swe_bench_lite.py b/benchmark/swe_bench_lite.py index fe9489cb5..2151cf53c 100644 --- a/benchmark/swe_bench_lite.py +++ b/benchmark/swe_bench_lite.py @@ -6,7 +6,7 @@ from imgcat import imgcat from matplotlib import rc -def plot_swe_bench_lite(data_file): +def plot_swe_bench(data_file, is_lite): with open(data_file, "r") as file: lines = file.readlines() @@ -45,7 +45,7 @@ def plot_swe_bench_lite(data_file): for model, pass_rate, color in zip(models, pass_rates, colors): alpha = 0.9 if "Aider" in model else 0.3 hatch = "" - # if "lite" not in data_file: + # if is_lite: # hatch = "///" if "(570)" in model else "" bar = ax.bar(model, pass_rate, color=color, alpha=alpha, zorder=3, hatch=hatch) bars.append(bar[0]) @@ -69,7 +69,7 @@ def plot_swe_bench_lite(data_file): # ax.set_xlabel("Models", fontsize=18) ax.set_ylabel("Instances resolved (%)", fontsize=18, color=font_color) - if "lite" in data_file: + if is_lite: title = "SWE Bench Lite" else: title = "SWE Bench" @@ -80,21 +80,22 @@ def plot_swe_bench_lite(data_file): color=font_color, ) - # Add note at the bottom of the graph - note = ( - "Note: (570) and (2294) refer to the number of SWE Bench instances that were benchmarked." - ) - plt.figtext( - 0.5, - 0.05, - note, - wrap=True, - horizontalalignment="center", - fontsize=12, - color=font_color, - ) + if is_lite: + plt.tight_layout(pad=3.0) + else: + # Add note at the bottom of the graph + note = "(570) and (2294) denote the number of SWE Bench instances benchmarked" + plt.figtext( + 0.5, + 0.05, + note, + wrap=True, + horizontalalignment="center", + fontsize=12, + color=font_color, + ) - plt.tight_layout(pad=3.0, rect=[0, 0.05, 1, 1]) + plt.tight_layout(pad=3.0, rect=[0, 0.05, 1, 1]) out_fname = Path(data_file.replace("-", "_")) plt.savefig(out_fname.with_suffix(".jpg").name) @@ -104,4 +105,6 @@ def plot_swe_bench_lite(data_file): fname = sys.argv[1] -plot_swe_bench_lite(fname) +is_lite = "lite" in fname + +plot_swe_bench(fname, is_lite)