shithub: amd64-simd

Download patch

ref: 499e44ebfde8c649d48d4c05093a8e1819be5349
parent: cdfd06439c4c4354e9fdc4f4124149a8d7abdfe5
author: rodri <rgl@antares-labs.eu>
date: Sat Dec 2 17:12:36 EST 2023

add a readme.

--- /dev/null
+++ b/readme.md
@@ -1,0 +1,223 @@
+# Running a benchmark
+
+	% mk pulldeps && mk; cd bench9 && mk; cd ../bench && mk && 6.out
+
+# Results so far
+
+The Plan 9 system already uses SSE instructions for FP operations, but they all come from the `*SD` (scalar double-precision) subset. The tests suffixed with `_sse` below use instructions from the `*PD` (packed double-precision) subset, which perform actual SIMD operations on the data using the 128-bit XMM registers. Those suffixed with `_avx` do the same using AVX instructions, which operate on the 256-bit wide YMM registers even when performing 128-bit operations. A trailing `_a` means the test loads its operands from aligned memory, conforming to the requirements of each instruction.
+
+## AMD Ryzen 3 1200
+
+```
+min
+                  op/s      98%      96%      75%      med      avg      min      max  
+fmin               361  3044124  2985684  2939465  2680777  2764144  2413529  3075293  
+fmin_sse           372  2896115  2779496  2703976  2677661  2687187  2478868  3335431  
+
+2d dot product
+                  op/s       98%       96%       75%       med       avg       min       max  
+dotvec2            287   3780198   3604559   3499310   3455915   3473738   3333691   3835117  
+dotvec2_sse         71  14434822  14355733  14107845  14021745  14020526  13589009  14477322  
+dotvec2_sse4        71  14625530  14403252  13963576  13875276  13903761  13520649  14845119  
+dotvec2_avx         73  14076825  13895586  13650538  13555359  13579508  13187532  14766759  
+dotvec2_sse_a      377   2830075   2802426   2673797   2638162   2648806   2497038   2835175  
+dotvec2_sse4_a     376   2821095   2797976   2683037   2647902   2658576   2517738   2853045  
+dotvec2_avx_a      274   3914877   3809088   3664309   3625289   3646672   3480450   3929327  
+
+3d dot product
+                  op/s       98%       96%       75%       med       avg       min       max  
+dotvec3            224   4671051   4632761   4506772   4448877   4460460   4248664   4706200  
+dotvec3_sse4        61  16951792  16523715  16307767  16199238  16224074  15851190  17695886  
+dotvec3_avx         60  17148110  16819313  16621474  16501135  16533685  16026849  17921144  
+dotvec3_sse4_a     375   2871885   2793346   2701906   2641992   2661341   2493918   3019034  
+
+3d cross product
+                  op/s       98%       96%       75%       med       avg       min       max  
+crossvec3          103  10677972  10357675   9631911   9557251   9621273   9300243  10730622  
+crossvec3_sse      174   6159839   5905261   5786872   5727122   5746890   5454484   6776764  
+
+Pt2
+                  op/s       98%       96%       75%       med       avg       min       max  
+Pt2                256   4403353   4060095   3924147   3877597   3901523   3713348   4504072  
+Pt2b               353   3218812   2964274   2842645   2798780   2829463   2542988   4211884  
+
+multiply + add
+                  op/s       98%       96%       75%       med       avg       min       max  
+madd               374   2862275   2771706   2680487   2635897   2666921   2464438   4505202  
+fma_avx            377   2939275   2776556   2670487   2629482   2648270   2453488   3007364  
+
+2d point sum
+                  op/s       98%       96%       75%       med       avg       min       max  
+addpt2             139   7414438   7376269   7237740   7154955   7158222   6895843   7649357  
+addpt2_sse          77  13324941  13153272  12959924  12803215  12856458  12526487  14520431  
+addpt2_avx          74  14000655  13705698  13478190  13381560  13392993  13009893  14768289  
+
+3d point sum
+                  op/s       98%       96%       75%       med       avg       min       max  
+addpt3             110  10508294   9225684   9030185   8937311   9014848   8688758  13812397  
+addpt3_avx          74  13747217  13686208  13453110  13313956  13354783  13000193  14005195  
+
+```
+
+## AMD Ryzen 5 1600
+
+```
+min
+                  op/s      98%      96%      75%      med      avg      min      max  
+fmin               381  3078249  3078041  2952450  2423244  2620317  2290029  3078428  
+fmin_sse           441  2375831  2373577  2248324  2247143  2264664  2247073  2383068  
+
+2d dot product
+                  op/s       98%       96%       75%       med       avg       min       max  
+dotvec2            246   4152258   4151970   4149845   4025763   4058215   4024602   4152843  
+dotvec2_sse         90  11131322  11129793  11128671  11127301  11083286  11001502  11131660  
+dotvec2_sse4        90  11130389  11129466  11128572  11127614  11088122  11001512  11130697  
+dotvec2_avx         89  11303035  11273362  11263197  11262283  11230664  11135710  11317291  
+dotvec2_sse_a      385   2736729   2727328   2615625   2582562   2596861   2522457   2741514  
+dotvec2_sse4_a     413   2566356   2544933   2445620   2427428   2421156   2280628   2619497  
+dotvec2_avx_a      270   3794864   3793861   3670891   3668062   3696267   3666861   3796492  
+dotvec2_p          396   2637237   2629504   2513364   2503983   2524380   2486908   2642032  
+
+3d dot product
+                  op/s       98%       96%       75%       med       avg       min       max  
+dotvec3            205   4958347   4957275   4954992   4830662   4865361   4793732   4958407  
+dotvec3_sse4        77  13009365  13008769  13007558  13006531  12984386  12879694  13012780  
+dotvec3_avx         73  13546345  13545521  13544051  13543108  13514745  13416326  13549015  
+dotvec3_sse4_a     339   3103713   3091482   2987682   2908482   2943368   2884299   3104656  
+
+3d cross product
+                  op/s       98%       96%       75%       med       avg       min       max  
+crossvec3          104  10100627  10009077   9554613   9527666   9575022   9392093  10430234  
+crossvec3_sse      127   7956531   7914250   7908721   7793683   7853727   7780996   8882017  
+
+Pt2
+                  op/s       98%       96%       75%       med       avg       min       max  
+Pt2                249   4172470   4167526   4051644   4015052   4015541   3845771   4177711  
+Pt2b               277   3705885   3704515   3583610   3578537   3607662   3577415   3707930  
+
+multiply + add
+                  op/s       98%       96%       75%       med       avg       min       max  
+madd               364   2942989   2871234   2805496   2736987   2741434   2592991   2944081  
+fma_avx            370   2841929   2821290   2717142   2688333   2697838   2638349   2864404  
+
+2d point sum
+                  op/s       98%       96%       75%       med       avg       min       max  
+addpt2             130   7934552   7872437   7716697   7677146   7666291   7523143   7938354  
+addpt2_sse          95  10593092  10591841  10534997  10451245  10433652  10196594  10593618  
+addpt2_avx          95  10602751  10592873  10590342  10464975  10475812  10260238  10647474  
+
+3d point sum
+                  op/s       98%       96%       75%       med       avg       min       max  
+addpt3             107   9698450   9697567   9572244   9149533   9309953   8982729   9698768  
+addpt3_avx          92  10863383  10861795  10860365  10859209  10816886  10733246  10912534  
+
+```
+
+## AMD Ryzen 3 5400U
+
+> provided by **llamaa**
+
+```
+min
+                  op/s      98%      96%      75%      med      avg      min      max  
+fmin               494  2382435  2038636  2032938  2002998  2023068  1995669  2719499  
+fmin_sse           500  2006126  2003791  2001287  1999331  1999632  1995669  2054751  
+
+2d dot product
+                  op/s      98%      96%      75%      med      avg      min      max  
+dotvec2            484  2258333  2251627  2096242  2011714  2063398  1995689  2258482  
+dotvec2_sse        108  9269760  9267864  9265149  9261662  9253332  9230724  9271536  
+dotvec2_sse4       108  9261647  9256628  9250741  9245292  9245148  9231083  9262984  
+dotvec2_avx        108  9262076  9255101  9247438  9241221  9242259  9231502  9265459  
+dotvec2_sse_a      499  2007793  2003033  2000349  1999331  2001089  1995669  2250011  
+dotvec2_sse4_a     516  2003063  2000977  1999361  1992919  1937670  1746216  2003083  
+dotvec2_avx_a      500  2004989  2003312  2000758  1999371  1999161  1995669  2006156  
+
+3d dot product
+                  op/s       98%       96%       75%       med       avg       min       max  
+dotvec3            384   2609136   2605464   2601782   2599278   2598953   2591854   2613926  
+dotvec3_sse4        99  10018456  10017478  10011281  10007719  10001388   9979919  10018635  
+dotvec3_avx         95  10505566  10503021  10495956  10490224  10491056  10479233  10506145  
+dotvec3_sse4_a     500   2005158   2003492   1999850   1999276   1998858   1995669   2018011  
+
+3d cross product
+                  op/s       98%       96%       75%       med       avg       min       max  
+crossvec3          185   5412204   5406706   5400679   5395620   5395606   5377330   5417363  
+crossvec3_sse      235   4255667   4254091   4248154   4245394   4246084   4240829   4255677  
+
+Pt2
+                  op/s       98%       96%       75%       med       avg       min       max  
+Pt2                444   2294315   2271873   2252545   2248803   2252010   2245131   2310360  
+Pt2b               489   2079758   2073401   2061986   2058149   2043489   1995669   2081534  
+
+multiply + add
+                  op/s       98%       96%       75%       med       avg       min       max  
+madd               496   2016813   2014618   2012503   2010926   2014984   1999371   2504193  
+fma_avx            494   2040792   2037179   2033457   2028159   2021563   1995669   2041171  
+
+2d point sum
+                  op/s       98%       96%       75%       med       avg       min       max  
+addpt2             231   4341951   4336353   4329379   4325162   4325468   4315289   4367826  
+addpt2_sse         114   8758561   8745859   8719007   8701080   8699660   8633920   8759040  
+addpt2_avx         114   8748343   8740161   8718897   8705890   8705307   8669034   8748852  
+
+3d point sum
+                  op/s       98%       96%       75%       med       avg       min       max  
+addpt3             190   5260970   5259025   5254235   5252459   5251446   5238679   5262717  
+addpt3_avx         108   9261707   9256249   9229686   9195514   9200410   9131717   9280945  
+
+```
+
+## Intel Core i5-10300H
+
+> provided by **uramekus**
+
+```
+min
+                  op/s      98%      96%      75%      med      avg      min      max  
+fmin               283  4568225  4565163  4563893  3543909  3527765  2488743  4571387  
+fmin_sse           414  2424605  2420142  2417015  2414723  2415306  2409636  2426538  
+
+2d dot product
+                  op/s       98%       96%       75%       med       avg       min       max  
+dotvec2            267   3744010   3742209   3737007   3735923   3736518   3731135   3767934  
+dotvec2_sse         73  13698430  13695892  13693351  13692214  13692714  13690210  13698510  
+dotvec2_sse4        75  13283629  13280936  13278169  13277493  13277783  13275146  13286098  
+dotvec2_avx         75  13283761  13279452  13278088  13277284  13277534  13275028  13287380  
+dotvec2_sse_a      373   2881463   2669431   2667139   2665984   2679081   2627220   3743409  
+dotvec2_sse4_a     393   2569252   2568403   2563474   2531893   2540101   2488852   3467011  
+dotvec2_avx_a      342   2910797   2906958   2905773   2905043   2915722   2903536   3964388  
+
+3d dot product
+                  op/s       98%       96%       75%       med       avg       min       max  
+dotvec3            206   5304768   5210421   4784381   4760076   4833173   4756544   5677615  
+dotvec3_sse4        65  15355526  15353759  15352531  15351795  15351913  15349422  15357083  
+dotvec3_avx         65  15354566  15354048  15352757  15351779  15351973  15349416  15357971  
+dotvec3_sse4_a     337   2959474   2957670   2956259   2955527   2963219   2926466   3756403  
+
+3d cross product
+                  op/s       98%       96%       75%       med       avg       min       max  
+crossvec3           91  10986157  10983141  10971494  10968571  10969827  10962082  10989235  
+crossvec3_sse      143   7001828   7000719   6998587   6996428   6974455   6863747   7199104  
+
+Pt2
+                  op/s       98%       96%       75%       med       avg       min       max  
+Pt2                227   4415172   4408012   4398711   4397851   4399747   4381089   4559482  
+Pt2b               277   3608261   3607181   3604815   3603066   3603403   3599702   3609269  
+
+multiply + add
+                  op/s       98%       96%       75%       med       avg       min       max  
+madd               347   2878112   2877760   2876937   2876112   2876230   2874718   2883503  
+fma_avx            341   2938089   2937966   2936871   2935967   2926577   2802398   2938710  
+
+2d point sum
+                  op/s       98%       96%       75%       med       avg       min       max  
+addpt2             120   8314705   8311729   8310052   8308082   8308210   8285693   8353482  
+addpt2_sse          89  11204824  11203825  11203201  11202634  11202605  11201048  11205531  
+addpt2_avx          73  13696975  13696348  13695183  13694241  13694325  13691704  13697637  
+
+3d point sum
+                  op/s       98%       96%       75%       med       avg       min       max  
+addpt3              94  10560908  10537671  10530696  10525254  10528137  10520659  10579957  
+addpt3_avx          77  12873349  12868526  12867472  12866408  12868654  12863977  13081995
+```