ref: 5d8816881fb5e701947896d65e09685f2b921462
parent: 61e263c3f236996c12b4ad09f63593305169d6bd
author: Sigrid Solveig Haflínudóttir <sigrid@ftrv.se>
date: Tue Oct 15 23:46:04 EDT 2024
cmprocess (plan9/arm64): make it faster for identity and brightness-only cases
--- a/cmprocess_arm64.s
+++ b/cmprocess_arm64.s
@@ -1,12 +1,51 @@
+#include "colormatrix.h"
+
TEXT cmprocess(SB), $0
MOV in+8(FP), R1
MOV out+16(FP), R2
MOVW cnt+24(FP), R3
+ MOVW cmkind(SB), R4
+ CMPW $CmIdent, R4
+ BEQ _done
+
+ CMPW $CmBright, R4
+ BNE _full
+
+/* just the brightness */
+ WORD $0x4f0797fc // orr v28.8h, 0xff, lsl 0
+ WORD $0x4f0777fd // orr v29.4s, 0xff, lsl 24
+ WORD $0x4d40c400 // ld1r {v0.8h}, [x0]
+ ORRW $3, R3
+ ADDW $1, R3
+_brightconv:
+ WORD $0x0cdfa821 // ld1 {v1.2s, v2.2s}, [x1], 16
+ WORD $0x2f08a421 // uxtl v1.8h, v1.8b
+ WORD $0x2f08a442 // uxtl v2.8h, v2.8b
+ WORD $0x2f40a023 // umull v3.4s, v1.4h, v0.h[0]
+ WORD $0x6f40a024 // umull2 v4.4s, v1.8h, v0.h[0]
+ WORD $0x2f40a045 // umull v5.4s, v2.4h, v0.h[0]
+ WORD $0x6f40a046 // umull2 v6.4s, v2.8h, v0.h[0]
+ WORD $0x0f148463 // shrn v3.4h, v3.4s, 12
+ WORD $0x4f148483 // shrn2 v3.8h, v4.4s, 12
+ WORD $0x0f1484a4 // shrn v4.4h, v5.4s, 12
+ WORD $0x4f1484c4 // shrn2 v4.8h, v6.4s, 12
+ WORD $0x4e7c6c63 // smin.8h v3, v3, v28
+ WORD $0x4e7c6c84 // smin.8h v4, v4, v28
+ WORD $0x0e212862 // xtn v2.8b, v3.8h
+ WORD $0x4e212882 // xtn2 v2.16b, v4.8h
+ WORD $0x4ebd1c42 // orr.16b v2, v2, v29
+ WORD $0x4c9f7042 // st1.16b {v2}, [x2], 16
+ SUBW $4, R3, R3
+ CBNZW R3, _brightconv
+ B _done
+
+/* full-on multiplication */
+_full:
+ WORD $0x4f0707fe // movi v30.4s, 0xff, lsl 0
WORD $0x4c40a000 // ld1.16b {v0, v1}, [x0]
WORD $0x6e3d1fbd // eor.16b v29, v29, v29
- WORD $0x4f0707fe // movi v30.4s, 0xff, lsl 0
-conv:
+_fullconv:
WORD $0x0ddf803f // ld1 {v31.s}[0], [x1], 4
WORD $0x0f0777ff // orr v31.2s, 0xff, lsl 24
WORD $0x2f08a7ff // uxtl.8h v31, v31
@@ -28,8 +67,8 @@
WORD $0x0e612842 // xtn v2.4h, v2.4s
WORD $0x0e212842 // xtn v2.8b, v2.8h
WORD $0x0d9f8042 // st1.s {v2}[0], [x2], 4
-
SUBW $1, R3, R3
- CBNZW R3, conv
+ CBNZW R3, _fullconv
+_done:
RETURN
--- a/colormatrix.h
+++ b/colormatrix.h
@@ -1,11 +1,10 @@
/* cmkind values for optimized special-casing */
-enum {
- CmIdent, /* identity (== 1) */
- CmBright, /* only brightness is changed (> 1) */
-};
+#define CmIdent 0 /* identity (== 1) */
+#define CmBright 1 /* only brightness is changed (> 1) */
#define CM(v) ((v)*(1<<12))
+#ifdef QUAKE_GAME
extern s16int cm[4*4];
extern cvar_t v_saturation;
extern cvar_t v_contrast;
@@ -15,3 +14,4 @@
void cmsetvblend(float blend[4]);
void cmprocess(s16int cm[4*4], void *in, void *out, int n);
void cminit(void);
+#endif