ref: 07398654cc5684fec2ff81fc1b21c3862536291c
parent: bb95b76619b6ce71052a4669bbe545d3a5544afa
author: eli <eli@owl>
date: Thu Jul 17 10:50:59 EDT 2025
llama2.c: support version 2 checkpoints (int8 quantized weights)
--- a/llama2.c
+++ b/llama2.c
@@ -18,6 +18,8 @@
//#endif
#define int8_t char
+#define uint8_t uchar
+#define int32_t int
#define ssize_t uvlong
#define size_t ulong
#define EXIT_FAILURE "exits"
@@ -28,7 +30,11 @@
#define powf pow
#define cosf cos
#define sinf sin
+#define uint32_t uint
+unsigned char quantized8 = 0; // set when the checkpoint holds version-2 int8 weights
+int GS = 32; // quantization group size; overwritten from the checkpoint header
+
// ----------------------------------------------------------------------------
// Transformer model
@@ -44,6 +50,11 @@
#define SIZEOFCONFIG 24
+typedef struct {
+ int8_t* q; // quantized values
+ float* s; // scaling factors
+} QuantizedTensor;
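+// element i of a tensor decodes as q[i] * s[i / GS]: one fp32 scale is
+// shared by each group of GS int8 values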
+
int read4(int fd) {
typedef union _result {
char buf[4];
@@ -79,7 +90,7 @@
float* wcls;
} TransformerWeights;
-#define SIZEOFTRANSFORMERWEIGHTS (12*sizeof(float*))
+#define SIZEOFTRANSFORMERWEIGHTS (12*sizeof(void*))
typedef struct {
// current wave of activations
@@ -88,6 +99,8 @@
float *xb2; // an additional buffer just for convenience (dim,)
float *hb; // buffer for hidden dimension in the ffn (hidden_dim,)
float *hb2; // buffer for hidden dimension in the ffn (hidden_dim,)
+ QuantizedTensor *xq; // quantized x (dim,)
+ QuantizedTensor *hq; // quantized hb (hidden_dim,)
float *q; // query (dim,)
float *k; // key (dim,)
float *v; // value (dim,)
@@ -113,6 +126,8 @@
#define SIZEOFTRANSFORMER (SIZEOFCONFIG+SIZEOFTRANSFORMERWEIGHTS+SIZEOFRUNSTATE+4+sizeof(float*)+sizeof(ssize_t))
void malloc_run_state(RunState* s, Config* p) {
+ QuantizedTensor *xq;
+ QuantizedTensor *hq;
// we calloc instead of malloc to keep valgrind happy
int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads;
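+ // kv_dim < dim when n_kv_heads < n_heads (multiquery attention)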
s->x = calloc(p->dim, sizeof(float));
@@ -120,7 +135,17 @@
s->xb2 = calloc(p->dim, sizeof(float));
s->hb = calloc(p->hidden_dim, sizeof(float));
s->hb2 = calloc(p->hidden_dim, sizeof(float));
+ xq = calloc(1, sizeof(QuantizedTensor));
+ hq = calloc(1, sizeof(QuantizedTensor));
+ xq->q = calloc(p->dim, sizeof(int8_t));
+ xq->s = calloc(p->dim, sizeof(float));
+ hq->q = calloc(p->hidden_dim, sizeof(int8_t));
+ hq->s = calloc(p->hidden_dim, sizeof(float));
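+ // only dim/GS (resp. hidden_dim/GS) scales are ever read; the full-size
+ // allocations are harmless and keep the sizing simple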
+ s->xq = xq;
+ s->hq = hq;
s->q = calloc(p->dim, sizeof(float));
+ s->k = calloc(kv_dim, sizeof(float));
+ s->v = calloc(kv_dim, sizeof(float));
s->key_cache = calloc(p->n_layers * p->seq_len * kv_dim, sizeof(float));
s->value_cache = calloc(p->n_layers * p->seq_len * kv_dim, sizeof(float));
s->att = calloc(p->n_heads * p->seq_len, sizeof(float));
@@ -127,6 +152,7 @@
s->logits = calloc(p->vocab_size, sizeof(float));
// ensure all mallocs went fine
if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q
+ || !s->k || !s->v || !s->xq || !s->hq || !s->xq->q || !s->xq->s || !s->hq->q || !s->hq->s
|| !s->key_cache || !s->value_cache || !s->att || !s->logits) {
fprintf(stderr, "malloc failed!\n");
exit(EXIT_FAILURE);
@@ -139,13 +165,132 @@
free(s->xb2);
free(s->hb);
free(s->hb2);
+ free(s->xq->q);
+ free(s->xq->s);
+ free(s->hq->q);
+ free(s->hq->s);
+ free(s->xq);
+ free(s->hq);
free(s->q);
- free(s->att);
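+// as in upstream llama2.c, forward() repoints k and v into the kv cache,
+// so freeing them here would pass mid-buffer pointers to free()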
+// free(s->k);
+// free(s->v);
+// free(s->att);
free(s->logits);
free(s->key_cache);
free(s->value_cache);
}
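+// this port defines its own round() since the host libm may not provide
+// one: round to nearest, halves away from zero (the C99 round() convention)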
+float round(float in) {
+ float f;
+
+ f = fmod(in, 1.0);
+
+ if (in > 0) {
+ if (f < 0.5)
+ return floor(in);
+ return ceil(in);
+ }
+
+ if (f > -0.5)
+ return ceil(in);
+ return floor(in);
+}
+
+// ----------------------------------------------------------------------------
+// Quantization functions
+
+void dequantize(QuantizedTensor *qx, float* x, int n, int m) {
+ // expand m quantized tensors of n values each into the fp32 buffer x;
+ // each tensor has its own q and s arrays (see init_quantized_tensors)
+ for (int j = 0; j < m; j++) {
+ for (int i = 0; i < n; i++) {
+ x[j * n + i] = qx[j].q[i] * qx[j].s[i / GS];
+ }
+ }
+}
+
+void quantize(QuantizedTensor *qx, float* x, int n) {
+ int num_groups = n / GS;
+ float Q_MAX = 127.0f;
+
+ for (int group = 0; group < num_groups; group++) {
+
+ // find the max absolute value in the current group
+ float wmax = 0.0;
+ for (int i = 0; i < GS; i++) {
+ float val = fabs(x[group * GS + i]);
+ if (val > wmax) {
+ wmax = val;
+ }
+ }
+
+ // calculate and write the scaling factor
+ float scale = wmax / Q_MAX;
+ qx->s[group] = scale;
+
+ // calculate and write the quantized values
+ for (int i = 0; i < GS; i++) {
+ float quant_value = x[group * GS + i] / scale; // scale into [-127, 127]
+ int8_t quantized = (int8_t) round(quant_value); // round to nearest; no clamp needed, |x| <= wmax
+ qx->q[group * GS + i] = quantized;
+ }
+ }
+}
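+// e.g. with GS = 32: a group whose largest magnitude is 0.5 gets scale
+// 0.5/127, so 0.5 -> 127 and 0.25 -> 64 (63.5 rounds away from zero);
+// dequantizing returns 0.5 exactly and 0.25 as 64 * 0.5/127 ~= 0.252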
+
+/* initialize `n` quantized tensors (`size_each` values each), mapping them from the memory pointed to by *ptr */
+QuantizedTensor *init_quantized_tensors(void **ptr, int n, int size_each) {
+ void *p = *ptr;
+ QuantizedTensor *res = malloc(n * sizeof(QuantizedTensor));
+ for(int i=0; i<n; i++) {
+ /* map quantized int8 values*/
+ res[i].q = (int8_t*)p;
+ p = (int8_t*)p + size_each;
+ /* map scale factors */
+ res[i].s = (float*)p;
+ p = (float*)p + size_each / GS;
+ }
+ *ptr = p; // advance ptr to current position
+ return res;
+}
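+// so on disk each tensor is its int8 q[] block immediately followed by its
+// fp32 s[] block, and successive tensors simply follow one another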
+
+void memory_map_weights_q8(TransformerWeights *w, Config* p, void* ptr, uint8_t shared_classifier) {
+ QuantizedTensor *q_tokens, *wq, *wk, *wv, *wo, *w1, *w2, *w3, *wcls;
+
+ int head_size = p->dim / p->n_heads;
+ // first are the parameters that are kept in fp32 (the rmsnorm (1D) weights)
+ float* fptr = (float*) ptr; // cast our pointer to float*
+ w->rms_att_weight = fptr;
+ fptr += p->n_layers * p->dim;
+ w->rms_ffn_weight = fptr;
+ fptr += p->n_layers * p->dim;
+ w->rms_final_weight = fptr;
+ fptr += p->dim;
+
+ // now read all the quantized weights
+ ptr = (void*)fptr; // now cast the pointer back to void*
+ q_tokens = init_quantized_tensors(&ptr, 1, p->vocab_size * p->dim);
+ // dequantize token embedding table
+ w->token_embedding_table = malloc(p->vocab_size * p->dim * sizeof(float));
+ dequantize(q_tokens, w->token_embedding_table, p->vocab_size * p->dim, 1);
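+ // the fp32 copy costs vocab_size * dim floats but leaves the embedding
+ // lookup in forward() untouched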
+
+ // for each weight class: map the quantized data, allocate an fp32 buffer,
+ // and expand into it (this port dequantizes everything up front)
+ wq = init_quantized_tensors(&ptr, p->n_layers, p->dim * (p->n_heads * head_size));
+ w->wq = malloc(p->n_layers * p->dim * (p->n_heads * head_size) * sizeof(float));
+ dequantize(wq, w->wq, p->dim * (p->n_heads * head_size), p->n_layers);
+ wk = init_quantized_tensors(&ptr, p->n_layers, p->dim * (p->n_kv_heads * head_size));
+ w->wk = malloc(p->n_layers * p->dim * (p->n_kv_heads * head_size) * sizeof(float));
+ dequantize(wk, w->wk, p->dim * (p->n_kv_heads * head_size), p->n_layers);
+ wv = init_quantized_tensors(&ptr, p->n_layers, p->dim * (p->n_kv_heads * head_size));
+ w->wv = malloc(p->n_layers * p->dim * (p->n_kv_heads * head_size) * sizeof(float));
+ dequantize(wv, w->wv, p->dim * (p->n_kv_heads * head_size), p->n_layers);
+ wo = init_quantized_tensors(&ptr, p->n_layers, (p->n_heads * head_size) * p->dim);
+ w->wo = malloc(p->n_layers * (p->n_heads * head_size) * p->dim * sizeof(float));
+ dequantize(wo, w->wo, (p->n_heads * head_size) * p->dim, p->n_layers);
+
+ w1 = init_quantized_tensors(&ptr, p->n_layers, p->dim * p->hidden_dim);
+ w->w1 = malloc(p->n_layers * p->dim * p->hidden_dim * sizeof(float));
+ dequantize(w1, w->w1, p->dim * p->hidden_dim, p->n_layers);
+ w2 = init_quantized_tensors(&ptr, p->n_layers, p->hidden_dim * p->dim);
+ w->w2 = malloc(p->n_layers * p->hidden_dim * p->dim * sizeof(float));
+ dequantize(w2, w->w2, p->hidden_dim * p->dim, p->n_layers);
+ w3 = init_quantized_tensors(&ptr, p->n_layers, p->dim * p->hidden_dim);
+ w->w3 = malloc(p->n_layers * p->dim * p->hidden_dim * sizeof(float));
+ dequantize(w3, w->w3, p->dim * p->hidden_dim, p->n_layers);
+
+ wcls = shared_classifier ? q_tokens : init_quantized_tensors(&ptr, 1, p->dim * p->vocab_size);
+ w->wcls = malloc(p->dim * p->vocab_size * sizeof(float));
+ dequantize(wcls, w->wcls, p->dim * p->vocab_size, 1);
+}
+
void memory_map_weights(TransformerWeights *w, Config* p, float* ptr, int shared_weights) {
int head_size = p->dim / p->n_heads;
// make sure the multiplications below are done in 64bit to fit the parameter counts of 13B+ models
@@ -184,21 +329,23 @@
int ret;
int fdt;
Dir *dstat;
+ unsigned int magic;
+ int header_size = 28; // legacy header: seven 4-byte config ints
+ uint8_t shared_classifier = 0;
+ int group_size;
+
fdt = open(checkpoint, OREAD);
if (fdt < 3) { fprintf(stderr, "Couldn't open file %s\n", checkpoint); exit(EXIT_FAILURE); }
-/*
-typedef struct {
- int dim; // transformer dimension
- int hidden_dim; // for ffn layers
- int n_layers; // number of layers
- int n_heads; // number of query heads
- int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
- int vocab_size; // vocabulary size, usually 256 (byte-level)
- int seq_len; // max sequence length
-} Config;
-*/
+ if (read(fdt, &magic, 4) != 4) { exits("read magic"); }
+ if (magic == 0x616b3432) { // "ak42", the llama2.c export magic
+ if (read(fdt, &magic, 4) != 4) { exits("read version"); }
+ if (magic != 2) { exits("version (quantized) is not 2"); }
+ quantized8 = 1;
+ header_size = 256; // v2 headers are padded out to 256 bytes
+ magic = read4(fdt); // pre-read the first config field (dim)
+ }
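+ // no magic word: a legacy checkpoint, so the word just read is already dim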
// read in the config header
- config->dim = read4(fdt);
+ config->dim = magic;
config->hidden_dim = read4(fdt);
config->n_layers = read4(fdt);
config->n_heads = read4(fdt);
@@ -206,6 +353,12 @@
config->vocab_size = read4(fdt);
config->seq_len = read4(fdt);
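+ // a v2 header continues with a shared-classifier flag (1 byte) and the
+ // quantization group size (4 bytes)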
+ if (quantized8 == 1) {
+ if (read(fdt, &shared_classifier, 1) != 1) exits("read shared_classifier");
+ if (read(fdt, &group_size, 4) != 4) exits("read group_size");
+ GS = group_size;
+ }
+
// negative vocab size is hacky way of signaling unshared weights. bit yikes.
int shared_weights = config->vocab_size > 0 ? 1 : 0;
config->vocab_size = abs(config->vocab_size);
@@ -231,8 +384,11 @@
}
close(*fd);
*fd = open(checkpoint, OREAD);
- float* weights_ptr = (float*)((char*)(*data) + 28);
- memory_map_weights(weights, config, weights_ptr, shared_weights);
+ float* weights_ptr = (float*)((char*)(*data) + header_size);
+ if (quantized8 == 0)
+ memory_map_weights(weights, config, weights_ptr, shared_weights);
+ else
+ memory_map_weights_q8(weights, config, weights_ptr, shared_classifier);
}
void build_transformer(Transformer *t, char* checkpoint_path) {
@@ -1008,8 +1164,7 @@
exit(EXIT_FAILURE);
}
-int main(int argc, char *argv[]) {
-
+void main(int argc, char *argv[]) {
// default parameters
char *checkpoint_path = NULL; // e.g. out/model.bin
char *tokenizer_path = "tokenizer.bin";
@@ -1073,6 +1228,7 @@
free_sampler(&sampler);
free_tokenizer(&tokenizer);
free_transformer(&transformer);
- return 0;
+
+ exits(nil);
}
--
⑨