shithub: util

Download patch

ref: 07398654cc5684fec2ff81fc1b21c3862536291c
parent: bb95b76619b6ce71052a4669bbe545d3a5544afa
author: eli <eli@owl>
date: Thu Jul 17 10:50:59 EDT 2025

version 2? quantized?

--- a/llama2.c
+++ b/llama2.c
@@ -18,6 +18,8 @@
 //#endif
 
 #define int8_t char
+#define uint8_t uchar
+#define int32_t int
 #define ssize_t uvlong
 #define size_t ulong
 #define EXIT_FAILURE "exits"
@@ -28,7 +30,11 @@
 #define powf pow
 #define cosf cos
 #define sinf sin
+#define uint32_t uint
 
+unsigned char quantized8 = 0;
+int GS = 32;
+
 // ----------------------------------------------------------------------------
 // Transformer model
 
@@ -44,6 +50,11 @@
 
 #define SIZEOFCONFIG 24
 
+typedef struct {
+    int8_t* q;    // quantized values
+    float* s; // scaling factors
+} QuantizedTensor;
+
 int read4(int fd) {
 	typedef union _result {
 		char buf[4];
@@ -79,7 +90,7 @@
     float* wcls;
 } TransformerWeights;
 
-#define SIZEOFTRANSFORMERWEIGHTS (12*sizeof(float*))
+#define SIZEOFTRANSFORMERWEIGHTS (12*sizeof(void*))
 
 typedef struct {
     // current wave of activations
@@ -88,6 +99,8 @@
     float *xb2; // an additional buffer just for convenience (dim,)
     float *hb; // buffer for hidden dimension in the ffn (hidden_dim,)
     float *hb2; // buffer for hidden dimension in the ffn (hidden_dim,)
+	QuantizedTensor *xq; // quantized x (dim,)
+	QuantizedTensor *hq; // quantized hb (hidden_dim,)
     float *q; // query (dim,)
     float *k; // key (dim,)
     float *v; // value (dim,)
@@ -113,6 +126,8 @@
 #define SIZEOFTRANSFORMER (SIZEOFCONFIG+SIZEOFTRANSFORMERWEIGHTS+SIZEOFRUNSTATE+4+sizeof(float*)+sizeof(ssize_t))
 
 void malloc_run_state(RunState* s, Config* p) {
+	QuantizedTensor *xq;
+	QuantizedTensor *hq;
     // we calloc instead of malloc to keep valgrind happy
     int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads;
     s->x = calloc(p->dim, sizeof(float));
@@ -120,7 +135,17 @@
     s->xb2 = calloc(p->dim, sizeof(float));
     s->hb = calloc(p->hidden_dim, sizeof(float));
     s->hb2 = calloc(p->hidden_dim, sizeof(float));
+	xq = calloc(1, sizeof(QuantizedTensor));
+	hq = calloc(1, sizeof(QuantizedTensor));
+	xq->q = calloc(p->dim, sizeof(int8_t));
+	xq->s = calloc(p->dim, sizeof(float));
+	hq->q = calloc(p->hidden_dim, sizeof(int8_t));
+	hq->s = calloc(p->hidden_dim, sizeof(float));
+	s->xq = xq;
+	s->hq = hq;
     s->q = calloc(p->dim, sizeof(float));
+	s->k = calloc(kv_dim, sizeof(float));
+	s->v = calloc(kv_dim, sizeof(float));
     s->key_cache = calloc(p->n_layers * p->seq_len * kv_dim, sizeof(float));
     s->value_cache = calloc(p->n_layers * p->seq_len * kv_dim, sizeof(float));
     s->att = calloc(p->n_heads * p->seq_len, sizeof(float));
@@ -127,6 +152,7 @@
     s->logits = calloc(p->vocab_size, sizeof(float));
     // ensure all mallocs went fine
     if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q
+     || !s->k || !s->v || !s->xq || !s->hq || !s->hq->s || !s->hq->q
      || !s->key_cache || !s->value_cache || !s->att || !s->logits) {
         fprintf(stderr, "malloc failed!\n");
         exit(EXIT_FAILURE);
@@ -139,13 +165,132 @@
     free(s->xb2);
     free(s->hb);
     free(s->hb2);
+    free(s->xq->q);                                                              
+    free(s->xq->s);                                                              
+    free(s->hq->q);                                                              
+    free(s->hq->s);
+	free(s->xq);
+	free(s->hq);
     free(s->q);
-    free(s->att);
+//  free(s->k);
+//  free(s->v);
+//  free(s->att);
     free(s->logits);
     free(s->key_cache);
     free(s->value_cache);
 }
 
+/* round half away from zero (this libc has no roundf) */
+float round(float in) {
+	float frac = fmod(in, 1.0);
+
+	if (in > 0) {
+		/* positive: .5 and above goes up */
+		if (frac >= 0.5)
+			return ceil(in);
+		return floor(in);
+	}
+	/* zero or negative: -.5 and below goes down */
+	if (frac <= -0.5)
+		return floor(in);
+	return ceil(in);
+}
+
+// ----------------------------------------------------------------------------
+// Quantization functions
+
+// dequantize m tensors of n values each into x; each QuantizedTensor has
+// its own q/s arrays (see init_quantized_tensors), so index per tensor
+void dequantize(QuantizedTensor *qx, float* x, int n, int m) {
+	for (int j = 0; j < m; j++)
+	    for (int i = 0; i < n; i++)
+	        x[j * n + i] = qx[j].q[i] * qx[j].s[i / GS];
+}
+
+void quantize(QuantizedTensor *qx, float* x, int n) {
+    int num_groups = n / GS;
+    float Q_MAX = 127.0f;
+
+    for (int group = 0; group < num_groups; group++) {
+        // find the max absolute value in the current group
+        float wmax = 0.0;
+        for (int i = 0; i < GS; i++) {
+            float val = fabs(x[group * GS + i]);
+            if (val > wmax) {
+                wmax = val;
+            }
+        }
+
+        // calculate and write the scaling factor; an all-zero group would
+        // otherwise produce scale 0 and divide by zero below
+        float scale = (wmax > 0.0f) ? wmax / Q_MAX : 1.0f;
+        qx->s[group] = scale;
+
+        // calculate and write the quantized values, clamped to int8 range
+        for (int i = 0; i < GS; i++) {
+            float quant_value = x[group * GS + i] / scale; // scale
+            if (quant_value > 127.0f) quant_value = 127.0f; else if (quant_value < -127.0f) quant_value = -127.0f;
+            qx->q[group * GS + i] = (int8_t) round(quant_value); // round and clamp
+        }
+    }
+}
+
+/* initialize `n` x quantized tensors (with `size_each` elements), starting from memory pointed at *ptr */
+/* layout per tensor: size_each int8 q values followed by size_each/GS float scales */
+QuantizedTensor *init_quantized_tensors(void **ptr, int n, int size_each) {
+    void *p = *ptr;
+    QuantizedTensor *res = malloc(n * sizeof(QuantizedTensor));
+    if (!res) { fprintf(stderr, "malloc failed!\n"); exit(EXIT_FAILURE); }
+    for(int i=0; i<n; i++) {
+        res[i].q = (int8_t*)p; /* map quantized int8 values */
+        p = (int8_t*)p + size_each;
+        res[i].s = (float*)p; /* map scale factors */
+        p = (float*)p + size_each / GS;
+    }
+    *ptr = p; // advance ptr to current position
+    return res;
+}
+
+void memory_map_weights_q8(TransformerWeights *w, Config* p, void* ptr, uint8_t shared_classifier) {
+	QuantizedTensor *q_tokens, *wq, *wk, *wv, *wo, *w1, *w2, *w3, *wcls;
+
+    int head_size = p->dim / p->n_heads;
+    // first are the parameters that are kept in fp32 (the rmsnorm (1D) weights)
+    float* fptr = (float*) ptr; // cast our pointer to float*
+    w->rms_att_weight = fptr;
+    fptr += p->n_layers * p->dim;
+    w->rms_ffn_weight = fptr;
+    fptr += p->n_layers * p->dim;
+    w->rms_final_weight = fptr;
+    fptr += p->dim;
+
+    // now read all the quantized weights; the fp32 forward pass expects
+    // plain float arrays, so every destination is dequantized into a
+    // freshly malloc'd buffer (allocation done inside each call)
+    ptr = (void*)fptr; // now cast the pointer back to void*
+    q_tokens = init_quantized_tensors(&ptr, 1, p->vocab_size * p->dim);
+    dequantize(q_tokens, w->token_embedding_table = malloc(p->vocab_size * p->dim * sizeof(float)), p->vocab_size * p->dim, 1);
+
+    wq = init_quantized_tensors(&ptr, p->n_layers, p->dim * (p->n_heads * head_size));
+	dequantize(wq, w->wq = malloc(p->n_layers * p->dim * (p->n_heads * head_size) * sizeof(float)), p->dim * (p->n_heads * head_size), p->n_layers);
+    wk = init_quantized_tensors(&ptr, p->n_layers, p->dim * (p->n_kv_heads * head_size));
+	dequantize(wk, w->wk = malloc(p->n_layers * p->dim * (p->n_kv_heads * head_size) * sizeof(float)), p->dim * (p->n_kv_heads * head_size), p->n_layers);
+    wv = init_quantized_tensors(&ptr, p->n_layers, p->dim * (p->n_kv_heads * head_size));
+	dequantize(wv, w->wv = malloc(p->n_layers * p->dim * (p->n_kv_heads * head_size) * sizeof(float)), p->dim * (p->n_kv_heads * head_size), p->n_layers);
+    wo = init_quantized_tensors(&ptr, p->n_layers, (p->n_heads * head_size) * p->dim);
+	dequantize(wo, w->wo = malloc(p->n_layers * (p->n_heads * head_size) * p->dim * sizeof(float)), (p->n_heads * head_size) * p->dim, p->n_layers);
+
+    w1 = init_quantized_tensors(&ptr, p->n_layers, p->dim * p->hidden_dim);
+	dequantize(w1, w->w1 = malloc(p->n_layers * p->dim * p->hidden_dim * sizeof(float)), p->dim * p->hidden_dim, p->n_layers);
+    w2 = init_quantized_tensors(&ptr, p->n_layers, p->hidden_dim * p->dim);
+	dequantize(w2, w->w2 = malloc(p->n_layers * p->hidden_dim * p->dim * sizeof(float)), p->hidden_dim * p->dim, p->n_layers);
+    w3 = init_quantized_tensors(&ptr, p->n_layers, p->dim * p->hidden_dim);
+	dequantize(w3, w->w3 = malloc(p->n_layers * p->dim * p->hidden_dim * sizeof(float)), p->dim * p->hidden_dim, p->n_layers);
+
+    wcls = shared_classifier ? q_tokens : init_quantized_tensors(&ptr, 1, p->dim * p->vocab_size);
+	dequantize(wcls, w->wcls = malloc(p->dim * p->vocab_size * sizeof(float)), p->dim * p->vocab_size, 1);
+}
+
 void memory_map_weights(TransformerWeights *w, Config* p, float* ptr, int shared_weights) {
     int head_size = p->dim / p->n_heads;
     // make sure the multiplications below are done in 64bit to fit the parameter counts of 13B+ models
@@ -184,21 +329,23 @@
 	int ret;
 	int fdt;
 	Dir *dstat;
+	unsigned int magic;
+	int header_size = 28;
+	uint8_t shared_classifier;
+	int group_size;
+
     fdt = open(checkpoint, OREAD);
     if (fdt < 3) { fprintf(stderr, "Couldn't open file %s\n", checkpoint); exit(EXIT_FAILURE); }
-/*
-typedef struct {
-    int dim; // transformer dimension
-    int hidden_dim; // for ffn layers
-    int n_layers; // number of layers
-    int n_heads; // number of query heads
-    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
-    int vocab_size; // vocabulary size, usually 256 (byte-level)
-    int seq_len; // max sequence length
-} Config;
-*/
+	if (read(fdt, &magic, 4) != 4) { exits("read magic"); }
+	if (magic == 0x616b3432) {
+		if (read(fdt, &magic, 4) != 4) { exits("read version"); }
+		if (magic != 2) { exits("version (quantized) is not 2"); }
+		quantized8 = 1;
+		header_size = 256;
+		magic = read4(fdt);
+	}
     // read in the config header
-    config->dim = read4(fdt);
+    config->dim = magic;
 	config->hidden_dim = read4(fdt);
 	config->n_layers = read4(fdt);
 	config->n_heads = read4(fdt);
@@ -206,6 +353,12 @@
 	config->vocab_size = read4(fdt);
 	config->seq_len = read4(fdt);
 
+	if (quantized8 == 1) {
+		if (read(fdt, &shared_classifier, 1) != 1) exits("read shared_classifier");
+		if (read(fdt, &group_size, 4) != 4) exits("read group_size");
+		GS = group_size;
+	}
+
     // negative vocab size is hacky way of signaling unshared weights. bit yikes.
     int shared_weights = config->vocab_size > 0 ? 1 : 0;
     config->vocab_size = abs(config->vocab_size);
@@ -231,8 +384,11 @@
 	}
 	close(*fd);
 	*fd = open(checkpoint, OREAD);
-    float* weights_ptr = (float*)((char*)(*data) + 28);
-    memory_map_weights(weights, config, weights_ptr, shared_weights);
+    float* weights_ptr = (float*)((char*)(*data) + header_size);
+	if (quantized8 == 0)
+	    memory_map_weights(weights, config, weights_ptr, shared_weights);
+	else
+		memory_map_weights_q8(weights, config, weights_ptr, shared_classifier);
 }
 
 void build_transformer(Transformer *t, char* checkpoint_path) {
@@ -1008,8 +1164,7 @@
     exit(EXIT_FAILURE);
 }
 
-int main(int argc, char *argv[]) {
-
+void main(int argc, char *argv[]) {
     // default parameters
     char *checkpoint_path = NULL;  // e.g. out/model.bin
     char *tokenizer_path = "tokenizer.bin";
@@ -1073,6 +1228,7 @@
     free_sampler(&sampler);
     free_tokenizer(&tokenizer);
     free_transformer(&transformer);
-    return 0;
+
+	exits(nil);
 }
 
--