diff --git a/.devops/tools.sh b/.devops/tools.sh
index b0196b6..ece9e4e 100755
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -23,7 +23,7 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
             echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./quantize "$i" "${i/f16/q4_0}" 2
+            ./quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
 else
diff --git a/README.md b/README.md
index 44cf721..509df61 100644
--- a/README.md
+++ b/README.md
@@ -203,8 +203,8 @@ python3 -m pip install -r requirements.txt
 # convert the 7B model to ggml FP16 format
 python3 convert.py models/7B/
 
-# quantize the model to 4-bits (using method 2 = q4_0)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
+# quantize the model to 4-bits (using q4_0 method)
+./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
 
 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index ad39a80..ec7f91a 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -2,8 +2,17 @@
 #include "llama.h"
 
 #include <cstdio>
+#include <map>
 #include <string>
 
+static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
+    {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
+    {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
+    {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
+    {"q4_3", LLAMA_FTYPE_MOSTLY_Q4_3},
+    {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
+};
+
 // usage:
 // ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
 //
@@ -12,11 +21,9 @@ int main(int argc, char ** argv) {
 
     if (argc < 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
-        fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
-        fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
-        fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
-        fprintf(stderr, " type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
-        fprintf(stderr, " type = %d - q8_0\n", LLAMA_FTYPE_MOSTLY_Q8_0);
+        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
+            fprintf(stderr, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
+        }
         return 1;
     }
 
@@ -30,7 +37,18 @@ int main(int argc, char ** argv) {
 
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];
-    const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
+    enum llama_ftype ftype;
+    if (argv[3][0] == 'q') {
+        auto it = LLAMA_FTYPE_MAP.find(argv[3]);
+        if (it == LLAMA_FTYPE_MAP.end()) {
+            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
+            return 1;
+        }
+        ftype = it->second;
+    } else {
+        ftype = (enum llama_ftype)atoi(argv[3]);
+    }
+
     int nthread = argc > 4 ? atoi(argv[4]) : 0;
 
     const int64_t t_main_start_us = ggml_time_us();