ref: 07398654cc5684fec2ff81fc1b21c3862536291c
parent: bb95b76619b6ce71052a4669bbe545d3a5544afa
author: eli <eli@owl>
date: Thu Jul 17 10:50:59 EDT 2025
llama2.c: support version 2 checkpoints (int8 quantized weights)
--- a/llama2.c
+++ b/llama2.c
@@ -18,6 +18,8 @@
//#endif
#define int8_t char
+#define uint8_t uchar
+#define int32_t int
#define ssize_t uvlong
#define size_t ulong
#define EXIT_FAILURE "exits"
@@ -28,7 +30,11 @@
#define powf pow
#define cosf cos
#define sinf sin
+#define uint32_t uint
+unsigned char quantized8 = 0; // set when the checkpoint holds version-2 int8 weights
+int GS = 32; // quantization group size; overwritten from the checkpoint header
+
// ----------------------------------------------------------------------------
// Transformer model
@@ -44,6 +50,11 @@
#define SIZEOFCONFIG 24
+typedef struct {
+ int8_t* q; // quantized values
+ float* s; // scaling factors
+} QuantizedTensor;
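+// element i of a tensor decodes as q[i] * s[i / GS]: one fp32 scale is
+// shared by each group of GS int8 values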
+
int read4(int fd) {
typedef union _result {
char buf[4];
@@ -79,7 +90,7 @@
float* wcls;
} TransformerWeights;
-#define SIZEOFTRANSFORMERWEIGHTS (12*sizeof(float*))
+#define SIZEOFTRANSFORMERWEIGHTS (12*sizeof(void*))
typedef struct {
// current wave of activations
@@ -88,6 +99,8 @@
float *xb2; // an additional buffer just for convenience (dim,)
float *hb; // buffer for hidden dimension in the ffn (hidden_dim,)
float *hb2; // buffer for hidden dimension in the ffn (hidden_dim,)
+ QuantizedTensor *xq; // quantized x (dim,)
+ QuantizedTensor *hq; // quantized hb (hidden_dim,)
float *q; // query (dim,)
float *k; // key (dim,)
float *v; // value (dim,)
@@ -113,6 +126,8 @@
#define SIZEOFTRANSFORMER (SIZEOFCONFIG+SIZEOFTRANSFORMERWEIGHTS+SIZEOFRUNSTATE+4+sizeof(float*)+sizeof(ssize_t))
void malloc_run_state(RunState* s, Config* p) {
+ QuantizedTensor *xq;
+ QuantizedTensor *hq;
// we calloc instead of malloc to keep valgrind happy
int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads;
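+ // kv_dim < dim when n_kv_heads < n_heads (multiquery attention)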
s->x = calloc(p->dim, sizeof(float));
@@ -120,7 +135,17 @@
s->xb2 = calloc(p->dim, sizeof(float));
s->hb = calloc(p->hidden_dim, sizeof(float));
s->hb2 = calloc(p->hidden_dim, sizeof(float));
+ xq = calloc(1, sizeof(QuantizedTensor));
+ hq = calloc(1, sizeof(QuantizedTensor));
+ xq->q = calloc(p->dim, sizeof(int8_t));
+ xq->s = calloc(p->dim, sizeof(float));
+ hq->q = calloc(p->hidden_dim, sizeof(int8_t));
+ hq->s = calloc(p->hidden_dim, sizeof(float));
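+ // only dim/GS (resp. hidden_dim/GS) scales are ever read; the full-size
+ // allocations are harmless and keep the sizing simple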
+ s->xq = xq;
+ s->hq = hq;
s->q = calloc(p->dim, sizeof(float));
+ s->k = calloc(kv_dim, sizeof(float));
+ s->v = calloc(kv_dim, sizeof(float));
s->key_cache = calloc(p->n_layers * p->seq_len * kv_dim, sizeof(float));
s->value_cache = calloc(p->n_layers * p->seq_len * kv_dim, sizeof(float));
s->att = calloc(p->n_heads * p->seq_len, sizeof(float));
@@ -127,6 +152,7 @@
s->logits = calloc(p->vocab_size, sizeof(float));
// ensure all mallocs went fine
if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q
+ || !s->k || !s->v || !s->xq || !s->hq || !s->xq->q || !s->xq->s || !s->hq->q || !s->hq->s
|| !s->key_cache || !s->value_cache || !s->att || !s->logits) {
fprintf(stderr, "malloc failed!\n");
exit(EXIT_FAILURE);
@@ -139,13 +165,132 @@
free(s->xb2);
free(s->hb);
free(s->hb2);
+ free(s->xq->q);
+ free(s->xq->s);
+ free(s->hq->q);
+ free(s->hq->s);
+ free(s->xq);
+ free(s->hq);
free(s->q);
- free(s->att);
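+// as in upstream llama2.c, forward() repoints k and v into the kv cache,
+// so freeing them here would pass mid-buffer pointers to free()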
+// free(s->k);
+// free(s->v);
+// free(s->att);
free(s->logits);
free(s->key_cache);
free(s->value_cache);
}
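+// this port defines its own round() since the host libm may not provide
+// one: round to nearest, halves away from zero (the C99 round() convention)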
+float round(float in) {
+ float f;
+
+ f = fmod(in, 1.0);
+
+ if (in > 0) {
+ if (f < 0.5)
+ return floor(in);
+ return ceil(in);
+ }
+
+ if (f > -0.5)
+ return ceil(in);
+ return floor(in);
+}
+
+// ----------------------------------------------------------------------------
+// Quantization functions
+
+void dequantize(QuantizedTensor *qx, float* x, int n, int m) {
+ // expand m quantized tensors of n values each into the fp32 buffer x;
+ // each tensor has its own q and s arrays (see init_quantized_tensors)
+ for (int j = 0; j < m; j++) {
+ for (int i = 0; i < n; i++) {
+ x[j * n + i] = qx[j].q[i] * qx[j].s[i / GS];
+ }
+ }
+}
+
+void quantize(QuantizedTensor *qx, float* x, int n) {
+ int num_groups = n / GS;
+ float Q_MAX = 127.0f;
+
+ for (int group = 0; group < num_groups; group++) {
+
+ // find the max absolute value in the current group
+ float wmax = 0.0;
+ for (int i = 0; i < GS; i++) {
+ float val = fabs(x[group * GS + i]);
+ if (val > wmax) {
+ wmax = val;
+ }
+ }
+
+ // calculate and write the scaling factor
+ float scale = wmax / Q_MAX;
+ qx->s[group] = scale;
+
+ // calculate and write the quantized values
+ for (int i = 0; i < GS; i++) {
+ float quant_value = x[group * GS + i] / scale; // scale into [-127, 127]
+ int8_t quantized = (int8_t) round(quant_value); // round to nearest; no clamp needed, |x| <= wmax
+ qx->q[group * GS + i] = quantized;
+ }
+ }
+}
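+// e.g. with GS = 32: a group whose largest magnitude is 0.5 gets scale
+// 0.5/127, so 0.5 -> 127 and 0.25 -> 64 (63.5 rounds away from zero);
+// dequantizing returns 0.5 exactly and 0.25 as 64 * 0.5/127 ~= 0.252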
+
+/* initialize `n` quantized tensors (`size_each` values each), mapping them from the memory pointed to by *ptr */
+QuantizedTensor *init_quantized_tensors(void **ptr, int n, int size_each) {
+ void *p = *ptr;
+ QuantizedTensor *res = malloc(n * sizeof(QuantizedTensor));
+ for(int i=0; i<n; i++) {
+ /* map quantized int8 values*/
+ res[i].q = (int8_t*)p;
+ p = (int8_t*)p + size_each;
+ /* map scale factors */
+ res[i].s = (float*)p;
+ p = (float*)p + size_each / GS;
+ }
+ *ptr = p; // advance ptr to current position
+ return res;
+}
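+// so on disk each tensor is its int8 q[] block immediately followed by its
+// fp32 s[] block, and successive tensors simply follow one another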
+
+void memory_map_weights_q8(TransformerWeights *w, Config* p, void* ptr, uint8_t shared_classifier) {
+ QuantizedTensor *q_tokens, *wq, *wk, *wv, *wo, *w1, *w2, *w3, *wcls;
+
+ int head_size = p->dim / p->n_heads;
+ // first are the parameters that are kept in fp32 (the rmsnorm (1D) weights)
+ float* fptr = (float*) ptr; // cast our pointer to float*
+ w->rms_att_weight = fptr;
+ fptr += p->n_layers * p->dim;
+ w->rms_ffn_weight = fptr;
+ fptr += p->n_layers * p->dim;
+ w->rms_final_weight = fptr;
+ fptr += p->dim;
+
+ // now read all the quantized weights
+ ptr = (void*)fptr; // now cast the pointer back to void*
+ q_tokens = init_quantized_tensors(&ptr, 1, p->vocab_size * p->dim);
+ // dequantize token embedding table
+ w->token_embedding_table = malloc(p->vocab_size * p->dim * sizeof(float));
+ dequantize(q_tokens, w->token_embedding_table, p->vocab_size * p->dim, 1);
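+ // the fp32 copy costs vocab_size * dim floats but leaves the embedding
+ // lookup in forward() untouched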
+
+ // for each weight class: map the quantized data, allocate an fp32 buffer,
+ // and expand into it (this port dequantizes everything up front)
+ wq = init_quantized_tensors(&ptr, p->n_layers, p->dim * (p->n_heads * head_size));
+ w->wq = malloc(p->n_layers * p->dim * (p->n_heads * head_size) * sizeof(float));
+ dequantize(wq, w->wq, p->dim * (p->n_heads * head_size), p->n_layers);
+ wk = init_quantized_tensors(&ptr, p->n_layers, p->dim * (p->n_kv_heads * head_size));
+ w->wk = malloc(p->n_layers * p->dim * (p->n_kv_heads * head_size) * sizeof(float));
+ dequantize(wk, w->wk, p->dim * (p->n_kv_heads * head_size), p->n_layers);
+ wv = init_quantized_tensors(&ptr, p->n_layers, p->dim * (p->n_kv_heads * head_size));
+ w->wv = malloc(p->n_layers * p->dim * (p->n_kv_heads * head_size) * sizeof(float));
+ dequantize(wv, w->wv, p->dim * (p->n_kv_heads * head_size), p->n_layers);
+ wo = init_quantized_tensors(&ptr, p->n_layers, (p->n_heads * head_size) * p->dim);
+ w->wo = malloc(p->n_layers * (p->n_heads * head_size) * p->dim * sizeof(float));
+ dequantize(wo, w->wo, (p->n_heads * head_size) * p->dim, p->n_layers);
+
+ w1 = init_quantized_tensors(&ptr, p->n_layers, p->dim * p->hidden_dim);
+ w->w1 = malloc(p->n_layers * p->dim * p->hidden_dim * sizeof(float));
+ dequantize(w1, w->w1, p->dim * p->hidden_dim, p->n_layers);
+ w2 = init_quantized_tensors(&ptr, p->n_layers, p->hidden_dim * p->dim);
+ w->w2 = malloc(p->n_layers * p->hidden_dim * p->dim * sizeof(float));
+ dequantize(w2, w->w2, p->hidden_dim * p->dim, p->n_layers);
+ w3 = init_quantized_tensors(&ptr, p->n_layers, p->dim * p->hidden_dim);
+ w->w3 = malloc(p->n_layers * p->dim * p->hidden_dim * sizeof(float));
+ dequantize(w3, w->w3, p->dim * p->hidden_dim, p->n_layers);
+
+ wcls = shared_classifier ? q_tokens : init_quantized_tensors(&ptr, 1, p->dim * p->vocab_size);
+ w->wcls = malloc(p->dim * p->vocab_size * sizeof(float));
+ dequantize(wcls, w->wcls, p->dim * p->vocab_size, 1);
+}
+
void memory_map_weights(TransformerWeights *w, Config* p, float* ptr, int shared_weights) {
int head_size = p->dim / p->n_heads;
// make sure the multiplications below are done in 64bit to fit the parameter counts of 13B+ models
@@ -184,21 +329,23 @@
int ret;
int fdt;
Dir *dstat;
+ unsigned int magic;
+ int header_size = 28; // legacy header: seven 4-byte config ints
+ uint8_t shared_classifier = 0;
+ int group_size;
+
fdt = open(checkpoint, OREAD);
if (fdt < 3) { fprintf(stderr, "Couldn't open file %s\n", checkpoint); exit(EXIT_FAILURE); }
-/*
-typedef struct {
- int dim; // transformer dimension
- int hidden_dim; // for ffn layers
- int n_layers; // number of layers
- int n_heads; // number of query heads
- int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
- int vocab_size; // vocabulary size, usually 256 (byte-level)
- int seq_len; // max sequence length
-} Config;
-*/
+ if (read(fdt, &magic, 4) != 4) { exits("read magic"); }
+ if (magic == 0x616b3432) { // "ak42", the llama2.c export magic
+ if (read(fdt, &magic, 4) != 4) { exits("read version"); }
+ if (magic != 2) { exits("version (quantized) is not 2"); }
+ quantized8 = 1;
+ header_size = 256; // v2 headers are padded out to 256 bytes
+ magic = read4(fdt); // pre-read the first config field (dim)
+ }
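+ // no magic word: a legacy checkpoint, so the word just read is already dim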
// read in the config header
- config->dim = read4(fdt);
+ config->dim = magic;
config->hidden_dim = read4(fdt);
config->n_layers = read4(fdt);
config->n_heads = read4(fdt);
@@ -206,6 +353,12 @@
config->vocab_size = read4(fdt);
config->seq_len = read4(fdt);
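+ // a v2 header continues with a shared-classifier flag (1 byte) and the
+ // quantization group size (4 bytes)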
+ if (quantized8 == 1) {
+ if (read(fdt, &shared_classifier, 1) != 1) exits("read shared_classifier");
+ if (read(fdt, &group_size, 4) != 4) exits("read group_size");
+ GS = group_size;
+ }
+
// negative vocab size is hacky way of signaling unshared weights. bit yikes.
int shared_weights = config->vocab_size > 0 ? 1 : 0;
config->vocab_size = abs(config->vocab_size);
@@ -231,8 +384,11 @@
}
close(*fd);
*fd = open(checkpoint, OREAD);
- float* weights_ptr = (float*)((char*)(*data) + 28);
- memory_map_weights(weights, config, weights_ptr, shared_weights);
+ float* weights_ptr = (float*)((char*)(*data) + header_size);
+ if (quantized8 == 0)
+ memory_map_weights(weights, config, weights_ptr, shared_weights);
+ else
+ memory_map_weights_q8(weights, config, weights_ptr, shared_classifier);
}
void build_transformer(Transformer *t, char* checkpoint_path) {
@@ -1008,8 +1164,7 @@
exit(EXIT_FAILURE);
}
-int main(int argc, char *argv[]) {
-
+void main(int argc, char *argv[]) {
// default parameters
char *checkpoint_path = NULL; // e.g. out/model.bin
char *tokenizer_path = "tokenizer.bin";
@@ -1073,6 +1228,7 @@
free_sampler(&sampler);
free_tokenizer(&tokenizer);
free_transformer(&transformer);
- return 0;
+
+ exits(nil);
}
--
⑨