Compare commits
97 Commits
quant-attn
...
master
Author | SHA1 | Date |
---|---|---|
|
a90e96b266 | 1 year ago |
|
94c5652fc0 | 1 year ago |
|
34d9f22f44 | 1 year ago |
|
d3e8093e9b | 1 year ago |
|
360cfe5bec | 1 year ago |
|
2edbdb0f99 | 1 year ago |
|
20fbf2a2a0 | 1 year ago |
|
db1080876a | 1 year ago |
|
c65a7fbfa9 | 1 year ago |
|
f647ce040f | 1 year ago |
|
799fdc1b5d | 1 year ago |
|
6daa09d879 | 1 year ago |
|
bca9ad938a | 1 year ago |
|
e2a937ca6a | 1 year ago |
|
b0c71c7b6d | 1 year ago |
|
a8a2efdc81 | 1 year ago |
|
e216aa0463 | 1 year ago |
|
2485d7a4d3 | 1 year ago |
|
13b0c68ed7 | 1 year ago |
|
55bc5f0900 | 1 year ago |
|
9daff419f6 | 1 year ago |
|
bf4b22ffe4 | 1 year ago |
|
67c77799e0 | 1 year ago |
|
0e6cbff1b7 | 1 year ago |
|
5d5817ca60 | 1 year ago |
|
8c9be35ff9 | 1 year ago |
|
cc0bb7235c | 1 year ago |
|
2bb992f034 | 1 year ago |
|
e2cd506999 | 1 year ago |
|
2d099e5193 | 1 year ago |
|
f4cef87edf | 1 year ago |
|
58b367c2d7 | 1 year ago |
|
ea3a0ad6b6 | 1 year ago |
|
2bdc09646d | 1 year ago |
|
70269cae37 | 1 year ago |
|
b925f1f1b0 | 1 year ago |
|
90b19bd6ee | 1 year ago |
|
7ff0dcd320 | 1 year ago |
|
6f79699286 | 1 year ago |
|
a5d30b1f53 | 1 year ago |
|
76a884920a | 1 year ago |
|
6bc4400e67 | 1 year ago |
|
f0d70f147d | 1 year ago |
|
3e5aa8a1c4 | 1 year ago |
|
c3ca7a5f05 | 1 year ago |
|
e8c051611a | 1 year ago |
|
0b5a935099 | 1 year ago |
|
ec728e44d7 | 1 year ago |
|
214b6a3570 | 1 year ago |
|
305eb5afd5 | 1 year ago |
|
84ca9c2ecf | 1 year ago |
|
334637e43e | 1 year ago |
|
dd7eff57d8 | 1 year ago |
|
7fc50c051a | 1 year ago |
|
b1ee8f59b4 | 1 year ago |
|
36d19a603b | 1 year ago |
|
7f15c5c477 | 1 year ago |
|
55390bcaf2 | 1 year ago |
|
5fba3c016b | 1 year ago |
|
1481a9cf25 | 1 year ago |
|
11d902364b | 1 year ago |
|
7296c961d9 | 1 year ago |
|
78ec543733 | 1 year ago |
|
92a6e13a31 | 1 year ago |
|
04aaae1d79 | 1 year ago |
|
0b2da20538 | 1 year ago |
|
f9be42add0 | 1 year ago |
|
574406dc7e | 1 year ago |
|
87a6f846d3 | 1 year ago |
|
ea3ad7eb60 | 1 year ago |
|
859fee6dfb | 1 year ago |
|
4afcc37869 | 1 year ago |
|
667c501334 | 1 year ago |
|
bb98e77be7 | 1 year ago |
|
7a32fcb3b2 | 1 year ago |
|
dd0eabc049 | 1 year ago |
|
54bb60e268 | 1 year ago |
|
8a0f8673ba | 1 year ago |
|
0c5692345d | 1 year ago |
|
957c8ae21d | 1 year ago |
|
9b0a4d4214 | 1 year ago |
|
2ec83428de | 1 year ago |
|
e4cf982e0d | 1 year ago |
|
c4fe84fb0d | 1 year ago |
|
1d78fecdab | 1 year ago |
|
284685f169 | 1 year ago |
|
edce63baa9 | 1 year ago |
|
ec9cdb6752 | 1 year ago |
|
e4422e299c | 1 year ago |
|
53c8434398 | 1 year ago |
|
c6524f46eb | 1 year ago |
|
c9e2c26f41 | 1 year ago |
|
0e018fe008 | 1 year ago |
|
857308d1e8 | 1 year ago |
|
c50b628810 | 1 year ago |
|
5f939498d5 | 1 year ago |
|
36b4f7e064 | 1 year ago |
@ -0,0 +1,7 @@
|
||||
set(TARGET benchmark)
|
||||
add_executable(${TARGET} benchmark-matmult.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
@ -0,0 +1,21 @@
|
||||
# llama.cpp/example/jeopardy
|
||||
|
||||
This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.
|
||||
|
||||
The jeopardy test can be used to compare the fact knowledge of different models and compare them to eachother. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
|
||||
|
||||
|
||||
Step 1: Open jeopardy.sh and modify the following:
|
||||
```
|
||||
MODEL=(path to your model)
|
||||
MODEL_NAME=(name of your model)
|
||||
prefix=(basically, if you use vicuna it's Human: , if you use something else it might be User: , etc)
|
||||
opts=(add -instruct here if needed for your model, or anything else you want to test out)
|
||||
```
|
||||
Step 2: Run `jeopardy.sh` from the llama.cpp folder
|
||||
|
||||
Step 3: Repeat steps 1 and 2 until you have all the results you need.
|
||||
|
||||
Step 4: Run `graph.py`, and follow the instructions. At the end, it will generate your final graph.
|
||||
|
||||
Note: The Human bar is based off of the full, original 100 sample questions. If you modify the question count or questions, it will not be valid.
|
@ -0,0 +1,56 @@
|
||||
import matplotlib.pyplot as plt
|
||||
import sys, os
|
||||
import csv
|
||||
|
||||
labels = []
|
||||
numbers = []
|
||||
numEntries = 1
|
||||
|
||||
rows = []
|
||||
|
||||
def bar_chart(numbers, labels, pos):
|
||||
plt.bar(pos, numbers, color='blue')
|
||||
plt.xticks(ticks=pos, labels=labels)
|
||||
plt.title("Jeopardy Results by Model")
|
||||
plt.xlabel("Model")
|
||||
plt.ylabel("Questions Correct")
|
||||
plt.show()
|
||||
|
||||
def calculatecorrect():
|
||||
directory = os.fsencode("./examples/jeopardy/results/")
|
||||
csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
|
||||
for row in csv_reader:
|
||||
global rows
|
||||
rows.append(row)
|
||||
for listing in os.listdir(directory):
|
||||
filename = os.fsdecode(listing)
|
||||
if filename.endswith(".txt"):
|
||||
file = open("./examples/jeopardy/results/" + filename, "rt")
|
||||
global labels
|
||||
global numEntries
|
||||
global numbers
|
||||
labels.append(filename[:-4])
|
||||
numEntries += 1
|
||||
i = 1
|
||||
totalcorrect = 0
|
||||
for line in file.readlines():
|
||||
if line.strip() != "------":
|
||||
print(line)
|
||||
else:
|
||||
print("Correct answer: " + rows[i][2] + "\n")
|
||||
i+=1
|
||||
print("Did the AI get the question right? (y/n)")
|
||||
if input() == "y":
|
||||
totalcorrect += 1
|
||||
numbers.append(totalcorrect)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
calculatecorrect()
|
||||
pos = list(range(numEntries))
|
||||
labels.append("Human")
|
||||
numbers.append(48.11)
|
||||
bar_chart(numbers, labels, pos)
|
||||
print(labels)
|
||||
print(numbers)
|
@ -0,0 +1,30 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin
|
||||
MODEL_NAME=Vicuna
|
||||
|
||||
# exec options
|
||||
prefix="Human: " # Ex. Vicuna uses "Human: "
|
||||
opts="--temp 0 -n 80" # additional flags
|
||||
nl='
|
||||
'
|
||||
introduction="You will be playing a game of Jeopardy. Simply answer the question in the correct format (Ex. What is Paris, or Who is George Washington)."
|
||||
|
||||
# file options
|
||||
question_file=./examples/jeopardy/questions.txt
|
||||
touch ./examples/jeopardy/results/$MODEL_NAME.txt
|
||||
output_file=./examples/jeopardy/results/$MODEL_NAME.txt
|
||||
|
||||
counter=1
|
||||
|
||||
echo 'Running'
|
||||
while IFS= read -r question
|
||||
do
|
||||
exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
|
||||
echo $counter
|
||||
echo "Current Question: $question"
|
||||
eval "$exe_cmd"
|
||||
echo -e "\n------" >> $output_file
|
||||
counter=$((counter+1))
|
||||
done < "$question_file"
|
Can't render this file because it has a wrong number of fields in line 34.
|
@ -1,3 +1,289 @@
|
||||
# main
|
||||
# llama.cpp/example/main
|
||||
|
||||
TODO
|
||||
This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Quick Start](#quick-start)
|
||||
2. [Common Options](#common-options)
|
||||
3. [Input Prompts](#input-prompts)
|
||||
4. [Interaction](#interaction)
|
||||
5. [Context Management](#context-management)
|
||||
6. [Generation Flags](#generation-flags)
|
||||
7. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
|
||||
8. [Additional Options](#additional-options)
|
||||
|
||||
## Quick Start
|
||||
|
||||
To get started right away, run the following command, making sure to use the correct path for the model you have:
|
||||
|
||||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
|
||||
```
|
||||
|
||||
#### Windows:
|
||||
|
||||
```powershell
|
||||
main.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
|
||||
```
|
||||
|
||||
For an interactive experience, try this command:
|
||||
|
||||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \
|
||||
'User: Hi
|
||||
AI: Hello. I am an AI chatbot. Would you like to talk?
|
||||
User: Sure!
|
||||
AI: What would you like to talk about?
|
||||
User:'
|
||||
```
|
||||
|
||||
#### Windows:
|
||||
|
||||
```powershell
|
||||
main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
|
||||
```
|
||||
|
||||
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
|
||||
|
||||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt
|
||||
```
|
||||
|
||||
#### Windows:
|
||||
|
||||
```powershell
|
||||
main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
|
||||
```
|
||||
|
||||
## Common Options
|
||||
|
||||
In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
|
||||
|
||||
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
|
||||
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
|
||||
- `-n N, --n_predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
|
||||
- `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
|
||||
|
||||
## Input Prompts
|
||||
|
||||
The `main` program provides several ways to interact with the LLaMA models using input prompts:
|
||||
|
||||
- `--prompt PROMPT`: Provide a prompt directly as a command-line option.
|
||||
- `--file FNAME`: Provide a file containing a prompt or multiple prompts.
|
||||
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
|
||||
- `--random-prompt`: Start with a randomized prompt.
|
||||
|
||||
## Interaction
|
||||
|
||||
The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`.
|
||||
|
||||
In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
|
||||
|
||||
### Interaction Options
|
||||
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
|
||||
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
|
||||
- `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions.
|
||||
- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
|
||||
|
||||
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
|
||||
|
||||
### Reverse Prompts
|
||||
|
||||
Reverse prompts are a powerful way to create a chat-like experience with a LLaMA model by pausing the text generation when specific text strings are encountered:
|
||||
|
||||
- `-r PROMPT, --reverse-prompt PROMPT`: Specify one or multiple reverse prompts to pause text generation and switch to interactive mode. For example, `-r "User:"` can be used to jump back into the conversation whenever it's the user's turn to speak. This helps create a more interactive and conversational experience. However, the reverse prompt doesn't work when it ends with a space.
|
||||
|
||||
To overcome this limitation, you can use the `--in-prefix` flag to add a space or any other characters after the reverse prompt.
|
||||
|
||||
### In-Prefix
|
||||
|
||||
The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
|
||||
|
||||
```sh
|
||||
./main -r "User:" --in-prefix " "
|
||||
```
|
||||
|
||||
### In-Suffix
|
||||
|
||||
The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
|
||||
|
||||
```sh
|
||||
./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
|
||||
```
|
||||
|
||||
### Instruction Mode
|
||||
|
||||
Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:
|
||||
|
||||
- `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
|
||||
|
||||
Technical detail: the user's input is internally prefixed with the reverse prompt (or `### Instruction:` as the default), and followed by `### Response:` (except if you just press Return without any input, to keep generating a longer response).
|
||||
|
||||
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
|
||||
|
||||
## Context Management
|
||||
|
||||
During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.
|
||||
|
||||
### Context Size
|
||||
|
||||
The `--ctx_size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
|
||||
|
||||
- `-c N, --ctx_size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
|
||||
|
||||
### Keep Prompt
|
||||
|
||||
The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
|
||||
|
||||
- `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
|
||||
|
||||
By utilizing context management options like `--ctx_size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
|
||||
|
||||
## Generation Flags
|
||||
|
||||
The following options allow you to control the text generation process and fine-tune the diversity, creativity, and quality of the generated text according to your needs. By adjusting these options and experimenting with different combinations of values, you can find the best settings for your specific use case.
|
||||
|
||||
### Number of Tokens to Predict
|
||||
|
||||
- `-n N, --n_predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
|
||||
|
||||
The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
|
||||
|
||||
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
|
||||
|
||||
### Temperature
|
||||
|
||||
- `--temp N`: Adjust the randomness of the generated text (default: 0.8).
|
||||
|
||||
Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
|
||||
|
||||
Example usage: `--temp 0.5`
|
||||
|
||||
### Repeat Penalty
|
||||
|
||||
- `--repeat_penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
|
||||
- `--repeat_last_n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx_size).
|
||||
- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.
|
||||
|
||||
The `repeat_penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
|
||||
|
||||
The `repeat_last_n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx_size`).
|
||||
|
||||
Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
|
||||
|
||||
Example usage: `--repeat_penalty 1.15 --repeat_last_n 128 --no-penalize-nl`
|
||||
|
||||
### Top-K Sampling
|
||||
|
||||
- `--top_k N`: Limit the next token selection to the K most probable tokens (default: 40).
|
||||
|
||||
Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
|
||||
|
||||
Example usage: `--top_k 30`
|
||||
|
||||
### Top-P Sampling
|
||||
|
||||
- `--top_p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
|
||||
|
||||
Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
|
||||
|
||||
Example usage: `--top_p 0.95`
|
||||
|
||||
### Tail Free Sampling (TFS)
|
||||
|
||||
- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
|
||||
|
||||
Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. The method adjusts the logits (token probabilities) by raising them to the power of the parameter z. A higher value of z (e.g., 2.0) will further suppress less likely tokens from the tail of the distribution, while a value of 1.0 disables the effect of TFS. By setting the parameter z, you can control how much the probabilities of less likely tokens are reduced.
|
||||
|
||||
Example usage: `--tfs 2.0`
|
||||
|
||||
### Locally Typical Sampling
|
||||
|
||||
- `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
|
||||
|
||||
Locally typical sampling promotes the generation of contextually coherent and diverse text by sampling tokens that are typical or expected based on the surrounding context. By setting the parameter p between 0 and 1, you can control the balance between producing text that is locally coherent and diverse. A value closer to 1 will promote more contextually coherent tokens, while a value closer to 0 will promote more diverse tokens. A value equal to 1 disables locally typical sampling.
|
||||
|
||||
Example usage: `--typical 0.9`
|
||||
|
||||
### Mirostat Sampling
|
||||
|
||||
- `--mirostat N`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
|
||||
- `--mirostat_lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
|
||||
- `--mirostat_ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).
|
||||
|
||||
Mirostat is an algorithm that actively maintains the quality of generated text within a desired range during text generation. It aims to strike a balance between coherence and diversity, avoiding low-quality output caused by excessive repetition (boredom traps) or incoherence (confusion traps).
|
||||
|
||||
The `--mirostat_lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.
|
||||
|
||||
The `--mirostat_ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.
|
||||
|
||||
Example usage: `--mirostat 2 --mirostat_lr 0.05 --mirostat_ent 3.0`
|
||||
|
||||
### Logit Bias
|
||||
|
||||
- `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion.
|
||||
|
||||
The logit bias option allows you to manually adjust the likelihood of specific tokens appearing in the generated text. By providing a token ID and a positive or negative bias value, you can increase or decrease the probability of that token being generated.
|
||||
|
||||
For example, use `--logit-bias 15043+1` to increase the likelihood of the token 'Hello', or `--logit-bias 15043-1` to decrease its likelihood. Using a value of negative infinity, `--logit-bias 15043-inf` ensures that the token `Hello` is never produced.
|
||||
|
||||
A more practical use case might be to prevent the generation of `\code{begin}` and `\code{end}` by setting the `\` token (29905) to negative infinity with `-l 29905-inf`. (This is due to the prevalence of LaTeX codes that show up in LLaMA model inference.)
|
||||
|
||||
Example usage: `--logit-bias 29905-inf`
|
||||
|
||||
### RNG Seed
|
||||
|
||||
- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
|
||||
|
||||
The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
|
||||
|
||||
## Performance Tuning and Memory Options
|
||||
|
||||
These options help improve the performance and memory usage of the LLaMA models. By adjusting these settings, you can fine-tune the model's behavior to better suit your system's capabilities and achieve optimal performance for your specific use case.
|
||||
|
||||
### Number of Threads
|
||||
|
||||
- `-t N, --threads N`: Set the number of threads to use during computation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance.
|
||||
|
||||
### Mlock
|
||||
|
||||
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. This can improve performance but trades away some of the advantages of memory-mapping by requiring more RAM to run and potentially slowing down load times as the model loads into RAM.
|
||||
|
||||
### No Memory Mapping
|
||||
|
||||
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.
|
||||
|
||||
### Memory Float 32
|
||||
|
||||
- `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of higher memory usage.
|
||||
|
||||
### Batch Size
|
||||
|
||||
- `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
|
||||
|
||||
### Session Caching
|
||||
|
||||
- `--session FNAME`: Specify a file to load/save the session, which caches the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The session file is created during the first run and is reused in subsequent runs. If you change your prompt such that 75% or less of the session is reusable, the existing session file will be overwritten with a new, updated version to maintain optimal performance.
|
||||
|
||||
### Quantization
|
||||
|
||||
For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
|
||||
|
||||
## Additional Options
|
||||
|
||||
These options provide extra functionality and customization when running the LLaMA models:
|
||||
|
||||
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
|
||||
- `--verbose-prompt`: Print the prompt before generating text.
|
||||
- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
|
@ -0,0 +1,7 @@
|
||||
set(TARGET save-load-state)
|
||||
add_executable(${TARGET} save-load-state.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
@ -0,0 +1,151 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
#include <chrono>
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
params.seed = 42;
|
||||
params.n_threads = 4;
|
||||
params.repeat_last_n = 64;
|
||||
params.prompt = "The quick brown fox";
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
|
||||
if (params.n_predict < 0) {
|
||||
params.n_predict = 16;
|
||||
}
|
||||
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_parts = params.n_parts;
|
||||
lparams.seed = params.seed;
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
lparams.use_mmap = params.use_mmap;
|
||||
lparams.use_mlock = params.use_mlock;
|
||||
|
||||
auto n_past = 0;
|
||||
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
|
||||
|
||||
// init
|
||||
auto ctx = llama_init_from_file(params.model.c_str(), lparams);
|
||||
auto tokens = std::vector<llama_token>(params.n_ctx);
|
||||
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
|
||||
|
||||
if (n_prompt_tokens < 1) {
|
||||
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// evaluate prompt
|
||||
llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
|
||||
|
||||
last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
|
||||
n_past += n_prompt_tokens;
|
||||
|
||||
const size_t state_size = llama_get_state_size(ctx);
|
||||
uint8_t * state_mem = new uint8_t[state_size];
|
||||
|
||||
// Save state (rng, logits, embedding and kv_cache) to file
|
||||
{
|
||||
FILE *fp_write = fopen("dump_state.bin", "wb");
|
||||
llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
|
||||
fwrite(state_mem, 1, state_size, fp_write);
|
||||
fclose(fp_write);
|
||||
}
|
||||
|
||||
// save state (last tokens)
|
||||
const auto last_n_tokens_data_saved = std::vector<llama_token>(last_n_tokens_data);
|
||||
const auto n_past_saved = n_past;
|
||||
|
||||
// first run
|
||||
printf("\n%s", params.prompt.c_str());
|
||||
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
auto next_token = llama_sample_token(ctx, &candidates_p);
|
||||
auto next_token_str = llama_token_to_str(ctx, next_token);
|
||||
last_n_tokens_data.push_back(next_token);
|
||||
|
||||
printf("%s", next_token_str);
|
||||
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
|
||||
printf("\n\n");
|
||||
|
||||
// free old model
|
||||
llama_free(ctx);
|
||||
|
||||
// load new model
|
||||
auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
|
||||
|
||||
// Load state (rng, logits, embedding and kv_cache) from file
|
||||
{
|
||||
FILE *fp_read = fopen("dump_state.bin", "rb");
|
||||
if (state_size != llama_get_state_size(ctx2)) {
|
||||
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const size_t ret = fread(state_mem, 1, state_size, fp_read);
|
||||
if (ret != state_size) {
|
||||
fprintf(stderr, "\n%s : failed to read state\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
|
||||
fclose(fp_read);
|
||||
}
|
||||
|
||||
delete[] state_mem;
|
||||
|
||||
// restore state (last tokens)
|
||||
last_n_tokens_data = last_n_tokens_data_saved;
|
||||
n_past = n_past_saved;
|
||||
|
||||
// second run
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto logits = llama_get_logits(ctx2);
|
||||
auto n_vocab = llama_n_vocab(ctx2);
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
auto next_token = llama_sample_token(ctx2, &candidates_p);
|
||||
auto next_token_str = llama_token_to_str(ctx2, next_token);
|
||||
last_n_tokens_data.push_back(next_token);
|
||||
|
||||
printf("%s", next_token_str);
|
||||
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
|
||||
printf("\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
@ -0,0 +1,398 @@
|
||||
#include "ggml-opencl.h"
|
||||
|
||||
#define CL_TARGET_OPENCL_VERSION 110
|
||||
#include <clblast_c.h>
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
||||
const char * clblast_dequant = MULTILINE_QUOTE(
|
||||
|
||||
struct block_q4_0
|
||||
{
|
||||
float d;
|
||||
uchar qs[16];
|
||||
};
|
||||
|
||||
__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
|
||||
const uint i = get_global_id(0) / 32;
|
||||
const uint l = get_local_id(0);
|
||||
|
||||
const float d = blocks[i].d;
|
||||
|
||||
const uchar vi = blocks[i].qs[l];
|
||||
|
||||
const uint index = i*32 + l*2;
|
||||
result[index + 0] = ((vi & 0xf) - 8)*d;
|
||||
result[index + 1] = ((vi >> 4) - 8)*d;
|
||||
}
|
||||
|
||||
struct block_q4_1
|
||||
{
|
||||
float d;
|
||||
float m;
|
||||
uchar qs[16];
|
||||
};
|
||||
|
||||
__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
|
||||
const uint i = get_global_id(0) / 32;
|
||||
const uint l = get_local_id(0);
|
||||
|
||||
const float d = blocks[i].d;
|
||||
const float m = blocks[i].m;
|
||||
|
||||
const uchar vi = blocks[i].qs[l];
|
||||
|
||||
const uint index = i*32 + l*2;
|
||||
result[index + 0] = (vi & 0xf) * d + m;
|
||||
result[index + 1] = (vi >> 4) * d + m;
|
||||
}
|
||||
|
||||
struct block_q4_2
|
||||
{
|
||||
ushort d;
|
||||
uchar qs[8];
|
||||
};
|
||||
|
||||
__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
|
||||
const uint i = get_global_id(0) / 16;
|
||||
const uint l = get_local_id(0);
|
||||
|
||||
const float d = vload_half(0, (__global half*) &blocks[i].d);
|
||||
|
||||
const uchar vi = blocks[i].qs[l];
|
||||
|
||||
const uint index = i*16 + l*2;
|
||||
result[index + 0] = ((vi & 0xf) - 8)*d;
|
||||
result[index + 1] = ((vi >> 4) - 8)*d;
|
||||
}
|
||||
|
||||
|
||||
struct block_q5_0
|
||||
{
|
||||
float d;
|
||||
uint qh;
|
||||
uchar qs[16];
|
||||
};
|
||||
|
||||
__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
|
||||
const uint i = get_global_id(0) / 32;
|
||||
const uint l = get_local_id(0);
|
||||
|
||||
const float d = blocks[i].d;
|
||||
|
||||
const uchar vi = blocks[i].qs[l];
|
||||
|
||||
const uint l2 = l * 2;
|
||||
|
||||
const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
|
||||
const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
|
||||
|
||||
const uint index = i*32 + l2;
|
||||
result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
|
||||
result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
|
||||
}
|
||||
|
||||
struct block_q5_1
|
||||
{
|
||||
ushort d;
|
||||
ushort m;
|
||||
uint qh;
|
||||
uchar qs[16];
|
||||
};
|
||||
|
||||
__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
|
||||
const uint i = get_global_id(0) / 32;
|
||||
const uint l = get_local_id(0);
|
||||
|
||||
const float d = vload_half(0, (__global half*) &blocks[i].d);
|
||||
const float m = vload_half(0, (__global half*) &blocks[i].m);
|
||||
|
||||
const uchar vi = blocks[i].qs[l];
|
||||
|
||||
const uint l2 = l * 2;
|
||||
|
||||
const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
|
||||
const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
|
||||
|
||||
const uint index = i*32 + l2;
|
||||
result[index + 0] = ((vi & 0xf) | vh0)*d + m;
|
||||
result[index + 1] = ((vi >> 4) | vh1)*d + m;
|
||||
}
|
||||
|
||||
struct block_q8_0
|
||||
{
|
||||
float d;
|
||||
char qs[32];
|
||||
};
|
||||
|
||||
__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
|
||||
const uint i = get_global_id(0) / 32;
|
||||
const uint l = get_local_id(0);
|
||||
|
||||
result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
|
||||
}
|
||||
|
||||
);
|
||||
|
||||
#define CL_CHECK(err, name) \
|
||||
do { \
|
||||
cl_int err_ = (err); \
|
||||
if (err_ != CL_SUCCESS) { \
|
||||
fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
|
||||
exit(1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define QK5_0 32
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
uint8_t qh[4]; // 5-th bit of quants
|
||||
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
||||
} block_q5_0;
|
||||
|
||||
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
uint32_t qh; // 5-th bit of quants
|
||||
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
||||
} cl_block_q5_0;
|
||||
|
||||
static cl_platform_id platform;
|
||||
static cl_device_id device;
|
||||
static cl_context context;
|
||||
static cl_command_queue queue;
|
||||
static cl_program program;
|
||||
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
|
||||
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
|
||||
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
|
||||
|
||||
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
|
||||
cl_program p;
|
||||
char *program_log;
|
||||
size_t program_size, log_size;
|
||||
int err;
|
||||
|
||||
program_size = strlen(program_buffer);
|
||||
|
||||
p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
|
||||
if(err < 0) {
|
||||
fprintf(stderr, "OpenCL error creating program");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
|
||||
if(err < 0) {
|
||||
|
||||
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
||||
program_log = (char*) malloc(log_size + 1);
|
||||
program_log[log_size] = '\0';
|
||||
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
|
||||
printf("%s\n", program_log);
|
||||
free(program_log);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
void ggml_cl_init(void) {
|
||||
cl_int err = 0;
|
||||
char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
|
||||
char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
|
||||
int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
|
||||
int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
|
||||
printf("\nInitializing CLBlast (First Run)...");
|
||||
printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
|
||||
cl_uint num_platforms;
|
||||
clGetPlatformIDs(0, NULL, &num_platforms);
|
||||
cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
|
||||
clGetPlatformIDs(num_platforms, platforms, NULL);
|
||||
platform = platforms[plat_num];
|
||||
char platform_buffer[1024];
|
||||
clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
|
||||
cl_uint num_devices;
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
|
||||
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
|
||||
device = devices[dev_num];
|
||||
char device_buffer[1024];
|
||||
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
|
||||
printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
|
||||
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
|
||||
CL_CHECK(err, "clCreateContext");
|
||||
queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
|
||||
CL_CHECK(err, "clCreateCommandQueue");
|
||||
|
||||
free(platforms);
|
||||
free(devices);
|
||||
|
||||
program = build_program_from_source(context, device, clblast_dequant);
|
||||
|
||||
// Prepare dequantize kernels
|
||||
kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
}
|
||||
|
||||
static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
|
||||
if (req_size <= *cur_size) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Reallocate buffer with enough space
|
||||
if (*cur_size > 0) {
|
||||
clReleaseMemObject(*buf);
|
||||
}
|
||||
cl_int err;
|
||||
*buf = clCreateBuffer(context, flags, req_size, NULL, &err);
|
||||
*cur_size = req_size;
|
||||
CL_CHECK(err, "clCreateBuffer");
|
||||
}
|
||||
|
||||
void ggml_cl_sgemm_wrapper(
|
||||
const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
|
||||
const int m, const int n, const int k,
|
||||
const float alpha, const void *host_a, const int lda,
|
||||
const float *host_b, const int ldb, const float beta,
|
||||
float *host_c, const int ldc, const int btype) {
|
||||
cl_int err = 0;
|
||||
|
||||
cl_kernel kernel;
|
||||
size_t global = n * k, local, size_qb;
|
||||
bool dequant;
|
||||
cl_block_q5_0* cl_host_b;
|
||||
|
||||
switch (btype) {
|
||||
case GGML_TYPE_F32:
|
||||
dequant = false;
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
dequant = true;
|
||||
kernel = kernel_q4_0;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(float) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
dequant = true;
|
||||
kernel = kernel_q4_1;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(float) * 2 + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q4_2:
|
||||
dequant = true;
|
||||
kernel = kernel_q4_2;
|
||||
local = 8;
|
||||
size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
dequant = true;
|
||||
kernel = kernel_q5_0;
|
||||
local = 16;
|
||||
// For some reason OpenCL seems to be incapable of working with structs of size 22.
|
||||
// 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
|
||||
// TODO Find the reason, fix and remove workaround.
|
||||
const block_q5_0* b = (const block_q5_0*) host_b;
|
||||
cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
|
||||
for (size_t i = 0; i < global / 32; i++) {
|
||||
cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
|
||||
memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
|
||||
memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
|
||||
}
|
||||
host_b = (const float*) cl_host_b;
|
||||
size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
dequant = true;
|
||||
kernel = kernel_q5_1;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
dequant = true;
|
||||
kernel = kernel_q8_0;
|
||||
local = 32;
|
||||
size_qb = global * (sizeof(float) + local) / 32;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
|
||||
abort();
|
||||
}
|
||||
|
||||
const size_t size_a = m * k * sizeof(float);
|
||||
const size_t size_b = n * k * sizeof(float);
|
||||
const size_t size_c = m * n * sizeof(float);
|
||||
|
||||
// Prepare buffers
|
||||
ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
|
||||
if (dequant) {
|
||||
ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
|
||||
}
|
||||
ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
|
||||
ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
|
||||
|
||||
cl_event ev_a, ev_qb, ev_b;
|
||||
|
||||
if (dequant) {
|
||||
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
|
||||
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
|
||||
CL_CHECK(err, "clSetKernelArg");
|
||||
err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
|
||||
CL_CHECK(err, "clEnqueueWriteBuffer qb");
|
||||
} else {
|
||||
err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
|
||||
CL_CHECK(err, "clEnqueueWriteBuffer b");
|
||||
}
|
||||
|
||||
err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
|
||||
CL_CHECK(err, "clEnqueueWriteBuffer a");
|
||||
if (dequant) {
|
||||
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
|
||||
CL_CHECK(err, "clEnqueueNDRangeKernel");
|
||||
clReleaseEvent(ev_qb);
|
||||
}
|
||||
clWaitForEvents(1, &ev_a);
|
||||
clWaitForEvents(1, &ev_b);
|
||||
clReleaseEvent(ev_a);
|
||||
clReleaseEvent(ev_b);
|
||||
|
||||
cl_event ev_sgemm;
|
||||
CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
|
||||
(CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
|
||||
m, n, k,
|
||||
alpha,
|
||||
cl_buffer_a, 0, lda,
|
||||
cl_buffer_b, 0, ldb,
|
||||
beta,
|
||||
cl_buffer_c, 0, ldc,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != CLBlastSuccess) {
|
||||
fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
|
||||
abort();
|
||||
}
|
||||
|
||||
cl_event ev_c;
|
||||
clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
|
||||
|
||||
// Wait for completion
|
||||
clWaitForEvents(1, &ev_c);
|
||||
clReleaseEvent(ev_sgemm);
|
||||
clReleaseEvent(ev_c);
|
||||
if (btype == GGML_TYPE_Q5_0) {
|
||||
free((void*) cl_host_b);
|
||||
}
|
||||
}
|
@ -0,0 +1,24 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_cl_init(void);
|
||||
|
||||
enum ggml_blas_order {
|
||||
GGML_BLAS_ORDER_ROW_MAJOR = 101,
|
||||
GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
|
||||
};
|
||||
|
||||
enum ggml_blas_op {
|
||||
GGML_BLAS_OP_N = 111,
|
||||
GGML_BLAS_OP_T = 112,
|
||||
GGML_BLAS_OP_C = 113,
|
||||
};
|
||||
|
||||
void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
@ -0,0 +1,7 @@
|
||||
A chat between a curious human ("[[USER_NAME]]") and an artificial intelligence assistant ("[[AI_NAME]]"). The assistant gives helpful, detailed, and polite answers to the human's questions.
|
||||
|
||||
### [[USER_NAME]]: Hello, [[AI_NAME]].
|
||||
### [[AI_NAME]]: Hello. How may I help you today?
|
||||
### [[USER_NAME]]: Please tell me the largest city in Europe.
|
||||
### [[AI_NAME]]: Sure. The largest city in Europe is Moscow, the capital of Russia.
|
||||
### [[USER_NAME]]:
|
@ -0,0 +1,7 @@
|
||||
A chat between a curious human ("[[USER_NAME]]") and an artificial intelligence assistant ("[[AI_NAME]]"). The assistant gives helpful, detailed, and polite answers to the human's questions.
|
||||
|
||||
[[USER_NAME]]: Hello, [[AI_NAME]].
|
||||
[[AI_NAME]]: Hello. How may I help you today?
|
||||
[[USER_NAME]]: Please tell me the largest city in Europe.
|
||||
[[AI_NAME]]: Sure. The largest city in Europe is Moscow, the capital of Russia.
|
||||
[[USER_NAME]]:
|
@ -0,0 +1,53 @@
|
||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
|
||||
set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
|
||||
set(BUILD_NUMBER 0)
|
||||
set(BUILD_COMMIT "unknown")
|
||||
|
||||
# Look for git
|
||||
find_package(Git)
|
||||
if(NOT Git_FOUND)
|
||||
execute_process(
|
||||
COMMAND which git
|
||||
OUTPUT_VARIABLE GIT_EXECUTABLE
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
if(NOT GIT_EXECUTABLE STREQUAL "")
|
||||
set(Git_FOUND TRUE)
|
||||
message(STATUS "Found Git using 'which': ${GIT_EXECUTABLE}")
|
||||
else()
|
||||
message(WARNING "Git not found using 'find_package' or 'which'. Build info will not be accurate. Consider installing Git or ensuring it is in the PATH.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Get the commit count and hash
|
||||
if(Git_FOUND)
|
||||
execute_process(
|
||||
COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE HEAD
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
RESULT_VARIABLE GIT_HEAD_RESULT
|
||||
)
|
||||
execute_process(
|
||||
COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE COUNT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
RESULT_VARIABLE GIT_COUNT_RESULT
|
||||
)
|
||||
if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0)
|
||||
set(BUILD_COMMIT ${HEAD})
|
||||
set(BUILD_NUMBER ${COUNT})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Only write the header if it's changed to prevent unnecessary recompilation
|
||||
if(EXISTS ${HEADER_FILE})
|
||||
file(STRINGS ${HEADER_FILE} CONTENTS REGEX "BUILD_COMMIT \"([^\"]*)\"")
|
||||
list(GET CONTENTS 0 EXISTING)
|
||||
if(NOT EXISTING STREQUAL "#define BUILD_COMMIT \"${BUILD_COMMIT}\"")
|
||||
configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
|
||||
endif()
|
||||
else()
|
||||
configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
|
||||
endif()
|
@ -0,0 +1,7 @@
|
||||
#ifndef BUILD_INFO_H
|
||||
#define BUILD_INFO_H
|
||||
|
||||
#define BUILD_NUMBER @BUILD_NUMBER@
|
||||
#define BUILD_COMMIT "@BUILD_COMMIT@"
|
||||
|
||||
#endif // BUILD_INFO_H
|
@ -0,0 +1,22 @@
|
||||
#!/bin/sh
|
||||
|
||||
BUILD_NUMBER="0"
|
||||
BUILD_COMMIT="unknown"
|
||||
|
||||
REV_LIST=$(git rev-list --count HEAD)
|
||||
if [ $? -eq 0 ]; then
|
||||
BUILD_NUMBER=$REV_LIST
|
||||
fi
|
||||
|
||||
REV_PARSE=$(git rev-parse --short HEAD)
|
||||
if [ $? -eq 0 ]; then
|
||||
BUILD_COMMIT=$REV_PARSE
|
||||
fi
|
||||
|
||||
echo "#ifndef BUILD_INFO_H"
|
||||
echo "#define BUILD_INFO_H"
|
||||
echo ""
|
||||
echo "#define BUILD_NUMBER $BUILD_NUMBER"
|
||||
echo "#define BUILD_COMMIT \"$BUILD_COMMIT\""
|
||||
echo ""
|
||||
echo "#endif // BUILD_INFO_H"
|
@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
cp -rpv ../ggml/src/ggml.c ./ggml.c
|
||||
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
|
||||
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
|
||||
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
|
@ -0,0 +1,77 @@
|
||||
import os
|
||||
import hashlib
|
||||
|
||||
def sha256sum(file):
|
||||
block_size = 16 * 1024 * 1024 # 16 MB block size
|
||||
b = bytearray(block_size)
|
||||
file_hash = hashlib.sha256()
|
||||
mv = memoryview(b)
|
||||
with open(file, 'rb', buffering=0) as f:
|
||||
while True:
|
||||
n = f.readinto(mv)
|
||||
if not n:
|
||||
break
|
||||
file_hash.update(mv[:n])
|
||||
|
||||
return file_hash.hexdigest()
|
||||
|
||||
# Define the path to the llama directory (parent folder of script directory)
|
||||
llama_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
|
||||
|
||||
# Define the file with the list of hashes and filenames
|
||||
hash_list_file = os.path.join(llama_path, "SHA256SUMS")
|
||||
|
||||
# Check if the hash list file exists
|
||||
if not os.path.exists(hash_list_file):
|
||||
print(f"Hash list file not found: {hash_list_file}")
|
||||
exit(1)
|
||||
|
||||
# Read the hash file content and split it into an array of lines
|
||||
with open(hash_list_file, "r") as f:
|
||||
hash_list = f.read().splitlines()
|
||||
|
||||
# Create an array to store the results
|
||||
results = []
|
||||
|
||||
# Loop over each line in the hash list
|
||||
for line in hash_list:
|
||||
# Split the line into hash and filename
|
||||
hash_value, filename = line.split(" ")
|
||||
|
||||
# Get the full path of the file by joining the llama path and the filename
|
||||
file_path = os.path.join(llama_path, filename)
|
||||
|
||||
# Informing user of the progress of the integrity check
|
||||
print(f"Verifying the checksum of {file_path}")
|
||||
|
||||
# Check if the file exists
|
||||
if os.path.exists(file_path):
|
||||
# Calculate the SHA256 checksum of the file using hashlib
|
||||
file_hash = sha256sum(file_path)
|
||||
|
||||
# Compare the file hash with the expected hash
|
||||
if file_hash == hash_value:
|
||||
valid_checksum = "V"
|
||||
file_missing = ""
|
||||
else:
|
||||
valid_checksum = ""
|
||||
file_missing = ""
|
||||
else:
|
||||
valid_checksum = ""
|
||||
file_missing = "X"
|
||||
|
||||
# Add the results to the array
|
||||
results.append({
|
||||
"filename": filename,
|
||||
"valid checksum": valid_checksum,
|
||||
"file missing": file_missing
|
||||
})
|
||||
|
||||
|
||||
# Print column headers for results table
|
||||
print("\n" + "filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20))
|
||||
print("-" * 80)
|
||||
|
||||
# Output the results as a table
|
||||
for r in results:
|
||||
print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}")
|
@ -0,0 +1,154 @@
|
||||
// Unit tests for quantization specific functions - quantize, dequantize and dot product
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
|
||||
const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001;
|
||||
const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002;
|
||||
const float MAX_DOT_PRODUCT_ERROR = 0.02;
|
||||
|
||||
const char* RESULT_STR[] = {"ok", "FAILED"};
|
||||
|
||||
|
||||
// Generate synthetic data
|
||||
void generate_data(float offset, size_t n, float * dst) {
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
dst[i] = 0.1 + 2*cosf(i + offset);
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate RMSE between two float arrays
|
||||
float array_rmse(const float * a1, const float * a2, size_t n) {
|
||||
double sum = 0;
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
double diff = a1[i] - a2[i];
|
||||
sum += diff * diff;
|
||||
}
|
||||
return sqrtf(sum) / n;
|
||||
}
|
||||
|
||||
// Total quantization error on test data
|
||||
float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
|
||||
std::vector<uint8_t> tmp_q(2*test_size);
|
||||
std::vector<float> tmp_out(test_size);
|
||||
|
||||
qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
|
||||
qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size);
|
||||
return array_rmse(test_data, tmp_out.data(), test_size);
|
||||
}
|
||||
|
||||
// Total quantization error on test data
|
||||
float reference_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
|
||||
std::vector<uint8_t> tmp_q(2*test_size);
|
||||
std::vector<float> tmp_out(test_size);
|
||||
std::vector<float> tmp_out_ref(test_size);
|
||||
|
||||
qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
|
||||
qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size);
|
||||
|
||||
qfns.quantize_row_q_reference(test_data, tmp_q.data(), test_size);
|
||||
qfns.dequantize_row_q(tmp_q.data(), tmp_out_ref.data(), test_size);
|
||||
|
||||
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
||||
}
|
||||
|
||||
float dot_product(const float * a1, const float * a2, size_t test_size) {
|
||||
double sum = 0;
|
||||
for (size_t i = 0; i < test_size; i++) {
|
||||
sum += a1[i] * a2[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// Total dot product error
|
||||
float dot_product_error(quantize_fns_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
|
||||
std::vector<uint8_t> tmp_q1(2*test_size);
|
||||
std::vector<uint8_t> tmp_q2(2*test_size);
|
||||
|
||||
qfns.quantize_row_q (test_data1, tmp_q1.data(), test_size);
|
||||
qfns.quantize_row_q_dot(test_data2, tmp_q2.data(), test_size);
|
||||
|
||||
float result = INFINITY;
|
||||
qfns.vec_dot_q(test_size, &result, tmp_q1.data(), tmp_q2.data());
|
||||
|
||||
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
||||
|
||||
return fabsf(result - dot_ref) / test_size;
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
bool verbose = false;
|
||||
const size_t test_size = 32 * 128;
|
||||
|
||||
std::string arg;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
if (arg == "-v") {
|
||||
verbose = true;
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<float> test_data(test_size);
|
||||
std::vector<float> test_data2(test_size);
|
||||
|
||||
generate_data(0.0, test_data.size(), test_data.data());
|
||||
generate_data(1.0, test_data2.size(), test_data2.data());
|
||||
|
||||
// Initialize GGML, ensures float conversion tables are initialized
|
||||
struct ggml_init_params ggml_params = {
|
||||
/* .mem_size = */ 1*1024,
|
||||
/* .mem_buffer = */ NULL,
|
||||
/* .no_alloc = */ true,
|
||||
};
|
||||
struct ggml_context * ctx = ggml_init(ggml_params);
|
||||
|
||||
int num_failed = 0;
|
||||
bool failed = false;
|
||||
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
|
||||
|
||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
||||
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
||||
failed = !(total_error < MAX_QUANTIZATION_TOTAL_ERROR);
|
||||
num_failed += failed;
|
||||
if (failed || verbose) {
|
||||
printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
|
||||
}
|
||||
|
||||
const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
|
||||
failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
|
||||
num_failed += failed;
|
||||
if (failed || verbose) {
|
||||
printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
|
||||
}
|
||||
|
||||
const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
|
||||
failed = !(vec_dot_error < MAX_DOT_PRODUCT_ERROR);
|
||||
num_failed += failed;
|
||||
if (failed || verbose) {
|
||||
printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (num_failed || verbose) {
|
||||
printf("%d tests failed\n", num_failed);
|
||||
}
|
||||
|
||||
ggml_free(ctx);
|
||||
|
||||
return num_failed > 0;
|
||||
}
|
@ -0,0 +1,310 @@
|
||||
// Benchmark quantization specific functions on synthetic data
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#undef NDEBUG
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <functional>
|
||||
#include <inttypes.h>
|
||||
#include <math.h>
|
||||
#include <memory>
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define MAX_ALIGNMENT 64
|
||||
#define QK 32
|
||||
#define WARMUP 5
|
||||
#define ITERATIONS 10
|
||||
|
||||
#define L1_SIZE 32*128
|
||||
#define L2_SIZE 32*2048
|
||||
#define L3_SIZE 32*20480
|
||||
#define MEM_SIZE 32*2048000
|
||||
|
||||
struct quantize_perf_params {
|
||||
std::vector<std::string> include_types;
|
||||
std::vector<size_t> test_sizes;
|
||||
size_t alignment_offset = 0;
|
||||
bool op_quantize_row_q_reference = false;
|
||||
bool op_quantize_row_q = false;
|
||||
bool op_dequantize_row_q = false;
|
||||
bool op_quantize_row_q_dot = false;
|
||||
bool op_vec_dot_q = false;
|
||||
};
|
||||
|
||||
|
||||
#if defined(__x86_64__) || defined(__i386__)
|
||||
|
||||
#include <x86intrin.h>
|
||||
inline int64_t cpu_cycles() {
|
||||
// Rough way to detect new-ish CPUs
|
||||
#ifdef __POPCNT__
|
||||
unsigned int dummy;
|
||||
return __rdtscp(&dummy);
|
||||
#else
|
||||
return __rdtsc();
|
||||
#endif
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define cpu_cycles() 0
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// Generate synthetic data
|
||||
void generate_data(float offset, size_t n, float * dst) {
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
dst[i] = 0.1 + 2*cosf(i + offset);
|
||||
}
|
||||
}
|
||||
|
||||
float gigabytes_per_second(size_t bytes, int64_t usecs) {
|
||||
return bytes / (float) usecs * 1000000 / (1024*1024*1024);
|
||||
}
|
||||
|
||||
void * align_with_offset(void * ptr, int offset) {
|
||||
size_t dummy_size = MAX_ALIGNMENT * 4;
|
||||
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
|
||||
}
|
||||
|
||||
void benchmark_function(size_t size, size_t q_size, std::function<size_t(void)> function) {
|
||||
int64_t min_time_us = INT64_MAX;
|
||||
int64_t total_time_us = 0;
|
||||
int64_t min_time_cycles = INT64_MAX;
|
||||
int64_t total_time_cycles = 0;
|
||||
|
||||
for (int i = 0; i < WARMUP; i++) {
|
||||
function();
|
||||
}
|
||||
|
||||
|
||||
for (int i = 0; i < ITERATIONS; i++) {
|
||||
const int64_t start_time = ggml_time_us();
|
||||
const int64_t start_cycles = cpu_cycles();
|
||||
|
||||
function();
|
||||
|
||||
const int64_t end_cycles = cpu_cycles();
|
||||
const int64_t end_time = ggml_time_us();
|
||||
|
||||
total_time_cycles += end_cycles - start_cycles;
|
||||
min_time_cycles = std::min(min_time_cycles, end_cycles - start_cycles);
|
||||
total_time_us += end_time - start_time;
|
||||
min_time_us = std::min(min_time_us, end_time - start_time);
|
||||
}
|
||||
|
||||
printf(" min cycles/%d vals : %9.2f\n", QK, QK * min_time_cycles / (float) size);
|
||||
printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * ITERATIONS));
|
||||
printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * ITERATIONS, total_time_us));
|
||||
printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * ITERATIONS, total_time_us));
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
quantize_perf_params params {};
|
||||
|
||||
// read command line
|
||||
|
||||
bool invalid_param = false;
|
||||
std::string arg;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
if (arg == "--size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
size_t size = std::stoi(argv[i]);
|
||||
if (size % 32 != 0) {
|
||||
fprintf(stderr, "error: size %zu not divisible by 32\n", size);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.test_sizes.push_back(size);
|
||||
} else if (arg == "-3") {
|
||||
// quick select sizes that probably fit in CPU caches
|
||||
params.test_sizes.push_back(L1_SIZE);
|
||||
params.test_sizes.push_back(L2_SIZE);
|
||||
params.test_sizes.push_back(L3_SIZE);
|
||||
} else if (arg == "-4") {
|
||||
// quick select cache sizes + memory
|
||||
params.test_sizes.push_back(L1_SIZE);
|
||||
params.test_sizes.push_back(L2_SIZE);
|
||||
params.test_sizes.push_back(L3_SIZE);
|
||||
params.test_sizes.push_back(MEM_SIZE);
|
||||
} else if (arg == "--op") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::string op {argv[i]};
|
||||
if (op == "quantize_row_q_reference") {
|
||||
params.op_quantize_row_q_reference = true;
|
||||
} else if (op == "quantize_row_q") {
|
||||
params.op_quantize_row_q = true;
|
||||
} else if (op == "dequantize_row_q") {
|
||||
params.op_dequantize_row_q = true;
|
||||
} else if (op == "quantize_row_q_dot") {
|
||||
params.op_quantize_row_q_dot = true;
|
||||
} else if (op == "vec_dot_q") {
|
||||
params.op_vec_dot_q = true;
|
||||
} else {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
} else if (arg == "--type") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.include_types.push_back(argv[i]);
|
||||
} else if (arg == "--alignment-offset") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
int alignment = std::stoi(argv[i]);
|
||||
if (alignment < 0 || alignment > MAX_ALIGNMENT) {
|
||||
fprintf(stderr, "error: aligment-offset must be less than %d\n", MAX_ALIGNMENT);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.alignment_offset = alignment;
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (invalid_param) {
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.test_sizes.empty()) {
|
||||
params.test_sizes.push_back(L1_SIZE);
|
||||
}
|
||||
if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) {
|
||||
params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true;
|
||||
}
|
||||
|
||||
std::sort(params.test_sizes.begin(), params.test_sizes.end());
|
||||
size_t largest = params.test_sizes.back();
|
||||
|
||||
std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2);
|
||||
std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
|
||||
std::vector<uint8_t> test_q1_v(largest*4 + MAX_ALIGNMENT*2);
|
||||
std::vector<uint8_t> test_q2_v(largest*4 + MAX_ALIGNMENT*2);
|
||||
std::vector<uint8_t> test_out_v(largest*4 + MAX_ALIGNMENT*2);
|
||||
|
||||
float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset);
|
||||
float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset);
|
||||
float * test_q1 = (float *) align_with_offset(test_q1_v.data(), params.alignment_offset);
|
||||
float * test_q2 = (float *) align_with_offset(test_q2_v.data(), params.alignment_offset);
|
||||
float * test_out = (float *) align_with_offset(test_out_v.data(), params.alignment_offset);
|
||||
|
||||
generate_data(0, largest, test_data1);
|
||||
generate_data(1, largest, test_data2);
|
||||
|
||||
|
||||
// Initialize GGML, ensures float conversion tables are initialized
|
||||
struct ggml_init_params ggml_params = {
|
||||
/* .mem_size = */ 1*1024,
|
||||
/* .mem_buffer = */ NULL,
|
||||
/* .no_alloc = */ true,
|
||||
};
|
||||
struct ggml_context * ctx = ggml_init(ggml_params);
|
||||
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
|
||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
||||
printf("%s\n", ggml_type_name(type));
|
||||
|
||||
if (params.op_quantize_row_q_reference) {
|
||||
printf(" quantize_row_q_reference\n");
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void ) {
|
||||
qfns.quantize_row_q_reference(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
benchmark_function(size, quantized_size, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
if (params.op_quantize_row_q) {
|
||||
printf(" quantize_row_q\n");
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void ) {
|
||||
qfns.quantize_row_q(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
benchmark_function(size, quantized_size, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
if (params.op_dequantize_row_q) {
|
||||
printf(" dequantize_row_q\n");
|
||||
qfns.quantize_row_q(test_data1, test_q1, largest);
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void ) {
|
||||
qfns.dequantize_row_q(test_q1, test_out, size);
|
||||
return test_out[0];
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
benchmark_function(size, quantized_size, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
if (params.op_quantize_row_q_dot) {
|
||||
printf(" quantize_row_q_dot\n");
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void ) {
|
||||
qfns.quantize_row_q_dot(test_data1, test_q1, size);
|
||||
return test_q1[0];
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
benchmark_function(size, quantized_size, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
if (params.op_vec_dot_q) {
|
||||
printf(" vec_dot_q\n");
|
||||
qfns.quantize_row_q(test_data1, test_q1, largest);
|
||||
qfns.quantize_row_q(test_data2, test_q2, largest);
|
||||
for (size_t size : params.test_sizes) {
|
||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||
auto quantize_fn = [&](void ) {
|
||||
float result;
|
||||
qfns.vec_dot_q(size, &result, test_q1, test_q2);
|
||||
return result;
|
||||
};
|
||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||
benchmark_function(size, quantized_size, quantize_fn);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ggml_free(ctx);
|
||||
|
||||
return 0;
|
||||
}
|
@ -1,42 +0,0 @@
|
||||
#include "ggml.h"
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
|
||||
int main(void) {
|
||||
#define QK 32
|
||||
float src[QK];
|
||||
uint8_t dst[24];
|
||||
int64_t hist[16];
|
||||
|
||||
for (int i = 0; i < QK; i++) {
|
||||
src[i] = (float)(i + 1);
|
||||
}
|
||||
|
||||
size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
|
||||
assert(size == 20);
|
||||
float max_result = ((float *)dst)[0];
|
||||
float max_expected = src[31] / ((1 << 3) - 1);
|
||||
assert(max_result == max_expected);
|
||||
for (int i = 0; i < QK; i++) {
|
||||
uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
|
||||
uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
|
||||
assert(q4_result == q4_expected);
|
||||
}
|
||||
|
||||
size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
|
||||
assert(size == 24);
|
||||
float delta_result = ((float *)dst)[0];
|
||||
float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
|
||||
assert(delta_result == delta_expected);
|
||||
float min_result = ((float *)dst)[1];
|
||||
float min_expected = src[0];
|
||||
assert(min_result == min_expected);
|
||||
for (int i = 0; i < QK; i++) {
|
||||
uint8_t q4_result = (i % 2) ? (dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF);
|
||||
uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected);
|
||||
assert(q4_result == q4_expected);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
@ -0,0 +1,199 @@
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <numeric>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
|
||||
void dump(const llama_token_data_array * candidates) {
|
||||
for (size_t i = 0; i < candidates->size; i++) {
|
||||
printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
|
||||
}
|
||||
}
|
||||
|
||||
#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
|
||||
|
||||
|
||||
void test_top_k(const std::vector<float> & probs,
|
||||
const std::vector<float> & expected_probs,
|
||||
int k) {
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
llama_sample_softmax(nullptr, &candidates_p);
|
||||
DUMP(&candidates_p);
|
||||
llama_sample_top_k(nullptr, &candidates_p, k);
|
||||
DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < candidates_p.size; i++) {
|
||||
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void test_top_p(const std::vector<float> & probs,
|
||||
const std::vector<float> & expected_probs,
|
||||
float p) {
|
||||
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
llama_sample_softmax(nullptr, &candidates_p);
|
||||
DUMP(&candidates_p);
|
||||
llama_sample_top_p(nullptr, &candidates_p, p);
|
||||
DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < candidates_p.size; i++) {
|
||||
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void test_tfs(const std::vector<float> & probs,
|
||||
const std::vector<float> & expected_probs,
|
||||
float z) {
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
DUMP(&candidates_p);
|
||||
llama_sample_tail_free(nullptr, &candidates_p, z);
|
||||
DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < candidates_p.size; i++) {
|
||||
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void test_typical(const std::vector<float> & probs,
|
||||
const std::vector<float> & expected_probs,
|
||||
float p) {
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
DUMP(&candidates_p);
|
||||
llama_sample_typical(nullptr, &candidates_p, p);
|
||||
DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < candidates_p.size; i++) {
|
||||
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void test_repetition_penalty(
|
||||
const std::vector<float> & probs,
|
||||
const std::vector<llama_token> & last_tokens,
|
||||
const std::vector<float> & expected_probs,
|
||||
float penalty) {
|
||||
assert(probs.size() == expected_probs.size());
|
||||
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
llama_sample_softmax(nullptr, &candidates_p);
|
||||
DUMP(&candidates_p);
|
||||
llama_sample_repetition_penalty(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), penalty);
|
||||
llama_sample_softmax(nullptr, &candidates_p);
|
||||
DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < candidates_p.size; i++) {
|
||||
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-6);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void test_frequency_presence_penalty(
|
||||
const std::vector<float> & probs,
|
||||
const std::vector<llama_token> & last_tokens,
|
||||
const std::vector<float> & expected_probs,
|
||||
float alpha_frequency, float alpha_presence) {
|
||||
assert(probs.size() == expected_probs.size());
|
||||
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
llama_sample_softmax(nullptr, &candidates_p);
|
||||
// DUMP(&candidates_p);
|
||||
llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), alpha_frequency, alpha_presence);
|
||||
llama_sample_softmax(nullptr, &candidates_p);
|
||||
// DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < candidates_p.size; i++) {
|
||||
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
|
||||
}
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
ggml_time_init();
|
||||
|
||||
test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4}, 1);
|
||||
test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2}, 3);
|
||||
|
||||
test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4}, 0);
|
||||
test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3}, 0.7);
|
||||
test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2, 0.1}, 1);
|
||||
|
||||
test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3}, 0.25);
|
||||
test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.75);
|
||||
test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.99);
|
||||
|
||||
test_typical({0.97, 0.01, 0.01, 0.01}, {0.97}, 0.5);
|
||||
test_typical({0.4, 0.2, 0.2, 0.2}, {0.2, 0.2, 0.2}, 0.5);
|
||||
|
||||
test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.25, 0.25, 0.25, 0.25, 0}, 50.0);
|
||||
test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.5, 0.5, 0, 0, 0}, 50.0);
|
||||
test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.5, 0.5, 0, 0, 0}, 50.0);
|
||||
|
||||
test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.249997, 0.249997, 0.249997, 0.249997, 0.000011}, 5.0, 5.0);
|
||||
test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.499966, 0.499966, 0.000023, 0.000023, 0.000023}, 5.0, 5.0);
|
||||
test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.499977, 0.499977, 0.000023, 0.000023, 0.000000}, 5.0, 5.0);
|
||||
|
||||
printf("OK\n");
|
||||
}
|
Loading…
Reference in New Issue