ref: 74e5f814dd6404ee4fde1e6e5be6fa8b8c7da48a
parent: 14ce7d4eedd90cc6f6bcdb630f3ebbb843288067
author: cancel <cancel@cancel.fm>
date: Fri Nov 30 18:08:36 EST 2018
Add likely macro, force-no-inline on some sim stuff. Makes compilation a lot faster, and also makes runtime a lot faster on my computer. Probably due to fragile behavior of inlining on the big switch statements, but still worth doing. -Os without -flto on clang produces a slightly smaller binary that's just as fast, but I don't know how fragile that is to change; it may just happen to be good today. Need to get a workflow for going through and looking at the output asm on Linux.
--- a/base.h
+++ b/base.h
@@ -34,6 +34,12 @@
#define ORCA_ASSUME_ALIGNED(_ptr, _alignment) (_ptr)
#endif
+#if defined(__GNUC__) || defined(__clang__)
+#define ORCA_LIKELY(_x) __builtin_expect(_x, 1)
+#else
+#define ORCA_LIKELY(_x) (_x)
+#endif
+
#define ORCA_Y_MAX UINT16_MAX
#define ORCA_X_MAX UINT16_MAX
--- a/sim.c
+++ b/sim.c
@@ -2,16 +2,6 @@
#include "mark.h"
#include "sim.h"
-#if 0
-ORCA_FORCE_STATIC_INLINE void stupid_memcpy(char* restrict dest,
- char* restrict src, size_t sz) {
- for (size_t i = 0; i < sz; ++i) {
- dest[i] = src[i];
- }
-}
-#define ORCA_MEMCPY(_dest, _src, _sz) memcpy(_dest, _src, _sz)
-#endif
-
//////// Utilities
static Glyph const indexed_glyphs[] = {
@@ -55,8 +45,8 @@
// todo check if these inlines are actually being inlinded -- might be bad,
// should probably mark them not inlined
-static inline bool oper_has_neighboring_bang(Gbuffer gbuf, Usz h, Usz w, Usz y,
- Usz x) {
+static bool oper_has_neighboring_bang(Gbuffer gbuf, Usz h, Usz w, Usz y,
+ Usz x) {
return gbuffer_peek_relative(gbuf, h, w, y, x, 0, 1) == '*' ||
gbuffer_peek_relative(gbuf, h, w, y, x, 0, -1) == '*' ||
gbuffer_peek_relative(gbuf, h, w, y, x, 1, 0) == '*' ||
@@ -63,10 +53,10 @@
gbuffer_peek_relative(gbuf, h, w, y, x, -1, 0) == '*';
}
-static inline void oper_move_relative_or_explode(Gbuffer gbuf, Mbuffer mbuf,
- Usz height, Usz width,
- Glyph moved, Usz y, Usz x,
- Isz delta_y, Isz delta_x) {
+static ORCA_FORCE_NO_INLINE void
+oper_move_relative_or_explode(Gbuffer gbuf, Mbuffer mbuf, Usz height, Usz width,
+ Glyph moved, Usz y, Usz x, Isz delta_y,
+ Isz delta_x) {
Isz y0 = (Isz)y + delta_y;
Isz x0 = (Isz)x + delta_x;
if (y0 >= (Isz)height || x0 >= (Isz)width || y0 < 0 || x0 < 0) {
@@ -96,8 +86,9 @@
} Oper_bank_read_params;
// static may cause warning if programmer doesn't use bank storage
-void oper_bank_store(Oper_bank_write_params* bank_params, Usz width, Usz y,
- Usz x, I32* restrict vals, Usz num_vals) {
+void ORCA_FORCE_NO_INLINE oper_bank_store(Oper_bank_write_params* bank_params,
+ Usz width, Usz y, Usz x,
+ I32* restrict vals, Usz num_vals) {
assert(num_vals > 0);
Usz index = y * width + x;
assert(index < ORCA_BANK_INDEX_MAX);
@@ -104,8 +95,9 @@
bank_params->size =
bank_append(bank_params->bank, bank_params->size, index, vals, num_vals);
}
-Usz oper_bank_load(Oper_bank_read_params* bank_params, Usz width, Usz y, Usz x,
- I32* restrict out_vals, Usz out_count) {
+Usz ORCA_FORCE_NO_INLINE oper_bank_load(Oper_bank_read_params* bank_params,
+ Usz width, Usz y, Usz x,
+ I32* restrict out_vals, Usz out_count) {
Usz index = y * width + x;
assert(index < ORCA_BANK_INDEX_MAX);
return bank_read(bank_params->bank->data, bank_params->size,
@@ -166,7 +158,7 @@
(void)Tick_number; \
(void)bank_params;
-#define OPER_PHASE_SPEC static inline
+#define OPER_PHASE_SPEC static ORCA_FORCE_NO_INLINE
#define BEGIN_SOLO_PHASE_0(_oper_name) \
OPER_PHASE_SPEC void oper_phase0_##_oper_name(OPER_PHASE_0_COMMON_ARGS) { \
@@ -829,7 +821,7 @@
Glyph* glyph_row = gbuf + iy * width;
for (Usz ix = 0; ix < width; ++ix) {
Glyph glyph_char = glyph_row[ix];
- if (glyph_char == '.')
+ if (ORCA_LIKELY(glyph_char == '.'))
continue;
U8 cell_flags = mbuffer_peek(mbuf, height, width, iy, ix) &
(Mark_flag_lock | Mark_flag_sleep);
@@ -847,7 +839,7 @@
Glyph* glyph_row = gbuf + iy * width;
for (Usz ix = 0; ix < width; ++ix) {
Glyph glyph_char = glyph_row[ix];
- if (glyph_char == '.')
+ if (ORCA_LIKELY(glyph_char == '.'))
continue;
if (mbuffer_peek(mbuf, height, width, iy, ix) &
(Mark_flag_lock | Mark_flag_sleep))