py : new conversion script (#545)

Current status: Working, except for the latest GPTQ-for-LLaMa format
  that includes `g_idx`.  This turns out to require changes to GGML, so
  for now it only works if you use the `--outtype` option to dequantize it
  back to f16 (which is pointless except for debugging).

  I also included some cleanup for the C++ code.

  This script is meant to replace all the existing conversion scripts
  (including the ones that convert from older GGML formats), while also
  adding support for some new formats.  Specifically, I've tested with:

  - [x] `LLaMA` (original)
  - [x] `llama-65b-4bit`
  - [x] `alpaca-native`
  - [x] `alpaca-native-4bit`
  - [x] LLaMA converted to 'transformers' format using
        `convert_llama_weights_to_hf.py`
  - [x] `alpaca-native` quantized with `--true-sequential --act-order
        --groupsize 128` (dequantized only)
  - [x] same as above plus `--save_safetensors`
  - [x] GPT4All
  - [x] stock unversioned ggml
  - [x] ggmh

  There's enough overlap in the logic needed to handle these different
  cases that it seemed best to move to a single script.

  I haven't tried this with Alpaca-LoRA because I don't know where to find
  it.

  Useful features:

  - Uses multiple threads for a speedup in some cases (though the Python
    GIL limits the gain, and sometimes it's disk-bound anyway).

  - Combines split models into a single file (both the intra-tensor split
    of the original and the inter-tensor split of 'transformers' format
    files).  Single files are more convenient to work with and more
    friendly to future changes to use memory mapping on the C++ side.  To
    accomplish this without increasing memory requirements, it has some
    custom loading code which avoids loading whole input files into memory
    at once.

  - Because of the custom loading code, it no longer depends in PyTorch,
    which might make installing dependencies slightly easier or faster...
    although it still depends on NumPy and sentencepiece, so I don't know
    if there's any meaningful difference.  In any case, I also added a
    requirements.txt file to lock the dependency versions in case of any
    future breaking changes.

  - Type annotations checked with mypy.

  - Some attempts to be extra user-friendly:

      - The script tries to be forgiving with arguments, e.g. you can
        specify either the model file itself or the directory containing
        it.

      - The script doesn't depend on config.json / params.json, just in
        case the user downloaded files individually and doesn't have those
        handy.  But you still need tokenizer.model and, for Alpaca,
        added_tokens.json.

      - The script tries to give a helpful error message if
        added_tokens.json is missing.
pull/960/head
comex 1 year ago committed by GitHub
parent 0f07cacb05
commit 723dac55fa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -192,10 +192,10 @@ ls ./models
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
# install Python dependencies
python3 -m pip install torch numpy sentencepiece
python3 -m pip install -r requirements.txt
# convert the 7B model to ggml FP16 format
python3 convert-pth-to-ggml.py models/7B/ 1
python3 convert.py models/7B/
# quantize the model to 4-bits (using method 2 = q4_0)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2

@ -1,299 +0,0 @@
# Author: github.com/ductai199x
import argparse
import os
import struct
import numpy as np
import torch
from numba import njit
from tqdm.auto import tqdm
def read_header(fin):
values = struct.unpack("i" * 9, fin.read(4 * 9))
_, _, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = values
return {
"vocab_size": vocab_size,
"dim": dim,
"multiple_of": multiple_of,
"n_heads": n_heads,
"n_layers": n_layers,
}, ftype
def read_tokens(fin, vocab_size):
tokens = []
for _ in range(vocab_size):
text_len = struct.unpack("i", fin.read(4))[0]
text_bytes = fin.read(text_len)
try:
text = text_bytes.decode()
except UnicodeDecodeError:
text = text_bytes.decode(errors="replace")
score = struct.unpack("f", fin.read(4))[0]
tokens.append((text, score))
return tokens
@njit
def dequantize_weights_numba(fin_data, n_rows, n_cols):
qk = 32
nb = n_cols // qk
bs = 4 + (qk // 2)
weights = np.zeros((n_rows, n_cols), dtype=np.float32)
data_pos = 0
for row in range(n_rows):
for block in range(nb):
d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
data_pos += 4
packed_values = fin_data[data_pos : data_pos + (qk // 2)]
data_pos += qk // 2
for i in range(qk // 2):
packed_value = packed_values[i]
v0 = np.float32((packed_value & 0b00001111) - 8) * d
v1 = np.float32((packed_value >> 4) - 8) * d
weights[row, block * qk + 2 * i] = v0
weights[row, block * qk + 2 * i + 1] = v1
return weights
def dequantize_weights(fin, n_rows, n_cols):
qk = 32
nb = n_cols // qk
data_size = n_rows * n_cols // 2 + n_rows * nb * 4
fin_data = fin.read(data_size)
return dequantize_weights_numba(fin_data, n_rows, n_cols)
def read_variables(fin):
model = {}
pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
while True:
start_pos = fin.tell()
try:
n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
except struct.error:
break
shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
shape = shape[::-1]
name = fin.read(name_length).decode()
# ensure tensor data is aligned
tensor_data_offset = fin.tell()
tensor_data_offset = (tensor_data_offset + 31) & -32
fin.seek(tensor_data_offset)
if ftype_cur == 2:
# 4-bit quantized weights
dtype = np.uint8
data = dequantize_weights(fin, shape[0], shape[1])
data = data.reshape(shape)
elif ftype_cur == 0:
dtype = np.float32
data_size = np.prod(shape)
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
elif ftype_cur == 1:
dtype = np.float16
data_size = np.prod(shape)
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
pbar.update(fin.tell() - start_pos)
return model
def convert_to_hf_format(model, hparams):
# This works for llama 7B, need to test with other models
n_layers = hparams["n_layers"]
n_heads = hparams["n_heads"]
dim = hparams["dim"]
dims_per_head = dim // n_heads
base = 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
# permute for sliced rotary
def permute(w):
return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
state_dict = {}
for layer_i in range(n_layers):
state_dict.update(
{
f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
model[f"layers.{layer_i}.attention.wq.weight"]
),
f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
model[f"layers.{layer_i}.attention.wk.weight"]
),
f"model.layers.{layer_i}.self_attn.v_proj.weight": model[
f"layers.{layer_i}.attention.wv.weight"
],
f"model.layers.{layer_i}.self_attn.o_proj.weight": model[
f"layers.{layer_i}.attention.wo.weight"
],
f"model.layers.{layer_i}.mlp.gate_proj.weight": model[
f"layers.{layer_i}.feed_forward.w1.weight"
],
f"model.layers.{layer_i}.mlp.down_proj.weight": model[
f"layers.{layer_i}.feed_forward.w2.weight"
],
f"model.layers.{layer_i}.mlp.up_proj.weight": model[
f"layers.{layer_i}.feed_forward.w3.weight"
],
f"model.layers.{layer_i}.input_layernorm.weight": model[
f"layers.{layer_i}.attention_norm.weight"
],
f"model.layers.{layer_i}.post_attention_layernorm.weight": model[
f"layers.{layer_i}.ffn_norm.weight"
],
}
)
state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
state_dict.update(
{
"model.embed_tokens.weight": model["tok_embeddings.weight"],
"model.norm.weight": model["norm.weight"],
"lm_head.weight": model["output.weight"],
}
)
return state_dict
def chat(model, hparams, llama_dir):
from transformers import (GenerationConfig, LlamaForCausalLM,
LlamaTokenizer, StoppingCriteria,
StoppingCriteriaList)
from transformers.models.llama.configuration_llama import LlamaConfig
class StoppingCriteriaSub(StoppingCriteria):
def __init__(self):
super().__init__()
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
print(tokenizer.decode(input_ids[0]), end="", flush=True)
if input_ids[0][-1] == 13:
return True
return False
config = LlamaConfig(
vocab_size=hparams["vocab_size"],
dim=hparams["dim"],
num_hidden_layers=hparams["n_layers"],
num_attention_heads=hparams["n_heads"],
)
llama = LlamaForCausalLM(config=config)
llama.load_state_dict(state_dict=model, strict=True)
tokenizer = LlamaTokenizer.from_pretrained(llama_dir)
device = torch.device("cpu")
llama = llama.to(device)
ctx = """You are AI.
This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
User: Hello, AI.
AI: Hello! How can I assist you today?
"""
print(ctx.rstrip("\n"))
while True:
print("-" * 60)
prompt = input("User: ")
if ctx != "":
ctx = f"{ctx}User: {prompt}\n"
else:
ctx = f"{prompt}\nAI:"
ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
print("-" * 60)
if len(ctx.strip()) > 0:
input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
generation_config = GenerationConfig(
temperature=0.8,
top_p=0.95,
top_k=50,
repetition_penalty=1.1764,
)
with torch.no_grad():
generation_output = llama.generate(
input_ids=input_ids,
generation_config=generation_config,
return_dict_in_generate=True,
output_scores=True,
max_length=2048,
do_sample=True,
stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
)
s = generation_output.sequences[0]
decoded = tokenizer.decode(s)
ctx = f"{decoded}\n"
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
)
parser.add_argument(
"--prefix",
"-p",
type=str,
required=True,
help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
)
parser.add_argument(
"--hf",
action="store_true",
help="Whether to save the model in the Hugging Face format. (default: False)",
)
parser.add_argument(
"--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
)
args = parser.parse_args()
llama_dir = os.path.abspath(f"{args.input_dir}/../")
ggml_files = sorted(
[f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
)
fin = open(ggml_files[0], "rb")
hparams, ftype = read_header(fin)
tokens = read_tokens(fin, hparams["vocab_size"])
model = read_variables(fin)
for f in tqdm(ggml_files[1:]):
fin = open(f, "rb")
read_header(fin)
read_tokens(fin, hparams["vocab_size"])
model.update(read_variables(fin))
if args.hf:
model = convert_to_hf_format(model, hparams)
pth_ckpt = {
"state_dict": model,
"hparams": hparams,
"tokens": tokens,
}
torch.save(pth_ckpt, f"{args.input_dir}/{args.prefix}-to-torch.pth")
if args.chat:
if not args.hf:
model = convert_to_hf_format(model, hparams)
chat(model, hparams, llama_dir)
if __name__ == "__main__":
main()

@ -1,107 +0,0 @@
#!/usr/bin/env python3
#
# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
#
# Original by https://github.com/eiz
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
import argparse
import glob
import os
import struct
import sys
from sentencepiece import SentencePieceProcessor
HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
def parse_args():
parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
return parser.parse_args()
def read_header(f_in):
struct_fmt = "i" * (3 + len(HPARAMS))
struct_size = struct.calcsize(struct_fmt)
buf = f_in.read(struct_size)
return struct.unpack(struct_fmt, buf)
def write_header(f_out, header):
(magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
if magic != 0x67676d6c:
raise Exception('Invalid file magic. Must be an old style ggml file.')
values = [
0x67676d66, # magic: ggml in hex
1, # file version
vocab_size,
dim,
multiple_of,
n_heads,
n_layers,
rot,
ftype
]
f_out.write(struct.pack("i" * len(values), *values))
def write_tokens(fout, tokenizer):
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode()
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
piece = tokenizer.id_to_piece(i)
if len(piece) != 6:
print(f"Invalid token: {piece}")
sys.exit(1)
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
# TODO: GPT4All - add extra <pad> token
text = "<pad>".encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", 0.0))
def read_tokens(f_in, tokenizer):
for i in range(tokenizer.vocab_size()):
len_b = f_in.read(4)
(length,) = struct.unpack("i", len_b)
f_in.read(length)
def copy_all_data(f_out, f_in):
while True:
buf = f_in.read(1024 * 1024)
if not buf:
break
f_out.write(buf)
def convert_one_file(path_in, tokenizer):
path_tmp = f"{path_in}.tmp"
path_orig= f"{path_in}.orig"
print(f"converting {path_in}")
with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
write_header(f_out, read_header(f_in))
read_tokens(f_in, tokenizer)
write_tokens(f_out, tokenizer)
copy_all_data(f_out, f_in)
os.rename(path_in, path_orig)
os.rename(path_tmp, path_in)
def main():
args = parse_args()
tokenizer = SentencePieceProcessor(args.tokenizer_model)
convert_one_file(args.gpt4all_model, tokenizer)
if __name__ == "__main__":
main()

@ -1,172 +0,0 @@
# Convert a GPTQ quantized LLaMA model to a ggml compatible file
# Based on: https://github.com/qwopqwop200/GPTQ-for-LLaMa
#
import os
import re
import sys
import json
import struct
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor
if len(sys.argv) != 4:
print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
sys.exit(1)
fname_model = sys.argv[1]
fname_tokenizer = sys.argv[2]
dir_out = sys.argv[3]
model = torch.load(fname_model, map_location="cpu")
n_vocab, n_embd = model['model.embed_tokens.weight'].shape
n_layer = 1 + max(int(m.group(1)) for name in model
if (m := re.match(r'model\.layers\.([0-9]+)', name)))
# hardcoded:
n_mult = 256
n_head = {32: 32, 40: 40, 60: 52, 80: 64}[n_layer]
tokenizer = SentencePieceProcessor(fname_tokenizer)
assert tokenizer.vocab_size() == n_vocab
fname_out = sys.argv[3]
fout = open(fname_out, "wb")
fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", n_vocab))
fout.write(struct.pack("i", n_embd))
fout.write(struct.pack("i", n_mult))
fout.write(struct.pack("i", n_head))
fout.write(struct.pack("i", n_layer))
fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
fout.write(struct.pack("i", 4))
# This loop unchanged from convert-pth-to-ggml.py:
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode()
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
piece = tokenizer.id_to_piece(i)
if len(piece) != 6:
print(f"Invalid token: {piece}")
sys.exit(1)
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
def write_header(shape, dst_name, ftype_cur):
sname = dst_name.encode()
fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
fout.write(sname)
# ensure tensor data is aligned
tensor_data_offset = fout.tell()
tensor_data_offset = (tensor_data_offset + 31) & -32
fout.seek(tensor_data_offset)
def convert_non_q4(src_name, dst_name):
v = model[src_name]
shape = v.shape
print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
if len(shape) == 1:
print(" Converting to float32")
v = v.to(torch.float32)
ftype_cur = {torch.float16: 1, torch.float32: 0}[v.dtype]
# header
write_header(shape, dst_name, ftype_cur)
# data
v.numpy().tofile(fout)
def convert_q4(src_name, dst_name, permute=False):
zeros = model[f"{src_name}.zeros"].numpy()
scales = model[f"{src_name}.scales"].numpy()
bias = model[f"{src_name}.bias"].numpy()
qweight = model[f"{src_name}.qweight"].numpy().T # transpose
# Q4_1 does not support bias; good thing the bias is always all zeros.
assert not np.any(bias)
# Each int32 item is actually 8 int4 items packed together, and it's transposed.
shape = (qweight.shape[0], qweight.shape[1] * 8)
print(f"Processing Q4 variable: {src_name} with shape: {shape}")
# The output format has the int4 weights in groups of 32 rather than 8.
# It looks like this:
# For each row:
# For each group of 32 columns:
# - addend (float32, 4 bytes)
# - scale (float32, 4 bytes)
# - weights (int4 * 32, 16 bytes)
# Note that in the input, the scales and addends are shared between all
# the columns in a row, so we end up wasting quite a bit of memory with
# repeated scales and addends.
addends = -zeros # flip sign
# Since the output format is mixed between integers and floats, we have
# to hackily view the floats as int32s just so numpy will let us
# concatenate them.
addends_view = addends.view(dtype=np.int32)
scales_view = scales.view(dtype=np.int32)
# Split into groups of 4 columns (i.e. 32 columns of quantized data):
grouped = qweight.reshape([qweight.shape[0], qweight.shape[1] // 4, 4])
# Repeat addends and scales:
addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)
blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
if permute:
# Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
# This can be done after the above conversion because it doesn't affect column order/layout.
blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
.swapaxes(1, 2)
.reshape(blob.shape))
# header
write_header(shape, dst_name, 3) # ftype = Q4_1
# data
blob.tofile(fout)
convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
convert_non_q4("model.norm.weight", "norm.weight")
convert_non_q4("lm_head.weight", "output.weight")
for i in range(n_layer):
convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")
convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
convert_q4(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight")
convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")
fout.close()
print(f"Done. Output file: {fname_out}")
print()

@ -1,274 +1,11 @@
# Convert a LLaMA model checkpoint to a ggjt compatible file
#
# Load the model using Torch
# Iterate over all variables and write them to a binary file.
#
# For each variable, write the following:
# - Number of dimensions (int)
# - Name length (int)
# - Dimensions (int[n_dims])
# - Name (char[name_length])
# - Data (float[n_dims])
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#
# Compatibility stub
import argparse
import os
import sys
import json
import struct
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor
import convert
QK = 32
GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6
WTYPES = {
0: GGML_TYPE_F32,
1: GGML_TYPE_F16,
2: GGML_TYPE_Q4_0,
3: GGML_TYPE_Q4_1,
}
GGML_BLCK_SIZE = {
GGML_TYPE_Q4_0: QK,
GGML_TYPE_Q4_1: QK,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 1,
GGML_TYPE_I32: 1,
GGML_TYPE_F16: 1,
GGML_TYPE_F32: 1,
}
GGML_TYPE_SIZE = {
GGML_TYPE_Q4_0: 4 + QK//2,
GGML_TYPE_Q4_1: 4*2 + QK//2,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 2,
GGML_TYPE_I32: 4,
GGML_TYPE_F16: 2,
GGML_TYPE_F32: 4,
}
def ggml_nelements(shape):
r = 1
for i in shape:
r *= i
return r
def ggml_nbytes(shape, ftype):
x = ggml_nelements(shape)
t = WTYPES[ftype]
x *= GGML_TYPE_SIZE[t]
x //= GGML_BLCK_SIZE[t]
return x
def parse_args():
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
parser.add_argument('dir_model', help='directory containing the model checkpoint')
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
return parser.parse_args()
def get_n_parts(dim):
mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
n_parts = mappings.get(dim)
if n_parts is None:
print(f"Invalid dim: {dim}")
sys.exit(1)
print(f"n_parts = {n_parts}\n")
return n_parts
def load_hparams_and_tokenizer(dir_model):
# `dir_model` is something like `models/7B` or `models/7B/`.
# "tokenizer.model" is expected under model's parent dir.
# When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
# Let's use the model's parent dir directly.
model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
fname_hparams = f"{dir_model}/params.json"
fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
with open(fname_hparams, "r") as f:
hparams = json.load(f)
print(hparams)
tokenizer = SentencePieceProcessor(fname_tokenizer)
hparams.update({"vocab_size": tokenizer.vocab_size()})
return hparams, tokenizer
def write_header(fout, hparams, ftype):
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
values = [
0x67676a74, # magic: ggjt in hex
1, # file version
*[hparams[key] for key in keys],
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
ftype
]
fout.write(struct.pack("i" * len(values), *values))
def write_tokens(fout, tokenizer):
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode()
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
piece = tokenizer.id_to_piece(i)
if len(piece) != 6:
print(f"Invalid token: {piece}")
sys.exit(1)
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
def process_and_write_variables(fout, model, ftype, part_id, n_parts):
for name, datao in model.items():
if name.endswith("freqs"):
continue
# remove dimensions with a single element
data = datao.numpy().squeeze()
partshape = data.shape
n_dims = len(data.shape)
assert n_dims in (1, 2)
print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
# coerce single-dimensional tensors from float16 to float32
ftype_cur = 1
if ftype == 0 or n_dims == 1:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
# determine dimension along which multipart tensor is sharded
#
# split_dim 0 regex:
# - output.*
# - layers.*.attention.wq.weight
# - layers.*.attention.wk.weight
# - layers.*.attention.wv.weight
# - layers.*.feed_forward.w1.weight
# - layers.*.feed_forward.w3.weight
#
# split_dim 1 regex:
# - tok_embeddings.*
# - layers.*.attention.wo.weight
# - layers.*.feed_forward.w2.weight
#
if n_dims > 1:
split_dim = 1
if "tok_embeddings" in name:
split_dim = 1
elif "layers" in name:
if "attention.wo.weight" in name:
split_dim = 1
elif "feed_forward.w2.weight" in name:
split_dim = 1
else:
split_dim = 0
elif "output" in name:
split_dim = 0
# output tensor header
fullshape = list(partshape)
if n_dims > 1:
fullshape[split_dim] *= n_parts
sname = name.encode()
fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
for dim in reversed(fullshape):
fout.write(struct.pack("i", dim))
fout.write(sname)
# ensure tensor data is aligned
tensor_data_offset = fout.tell()
while tensor_data_offset % QK != 0:
fout.write(struct.pack("B", 0))
tensor_data_offset += 1
# output unified mappable tensor data
if n_dims == 1 or n_parts == 1:
# copy tensor which we thankfully received in one piece
if part_id == 0:
data.tofile(fout)
elif split_dim == 0:
# reassemble multifile tensor containing some of the rows
rows_per_chunk = partshape[0]
current_row = part_id * rows_per_chunk
bytes_per_row = fullshape[1] // blck_size * type_size
offset = current_row * bytes_per_row
fout.seek(tensor_data_offset + offset)
data.tofile(fout)
elif split_dim == 1:
# reassemble multifile tensor containing some of the cols
cols_per_chunk = partshape[1]
current_col = part_id * cols_per_chunk
bytes_per_row = fullshape[1] // blck_size * type_size
offset_current_col = current_col // blck_size * type_size
for row in range(partshape[0]):
offset_row = row * bytes_per_row
offset = offset_row + offset_current_col
fout.seek(tensor_data_offset + offset)
data[row].tofile(fout)
# advance file position to next tensor
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
def main():
args = parse_args()
dir_model = args.dir_model
ftype = args.ftype
ftype_str = ["f32", "f16"]
hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
print(args)
# if only writing vocab to file
if args.vocab_only:
fname_model = f"{dir_model}/consolidated.00.pth"
fname_out = f"{dir_model}/ggml-vocab.bin"
print(f"Extracting only the vocab from '{fname_model}'\n")
with open(fname_out, "wb") as fout:
write_header(fout, hparams, ftype)
write_tokens(fout, tokenizer)
print(f"Done. Output file: {fname_out}\n")
return
n_parts = get_n_parts(hparams["dim"])
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
# we output a single file for ggml
with open(fname_out, "wb") as fout:
write_header(fout, hparams, ftype)
write_tokens(fout, tokenizer)
offset_of_tensors = fout.tell()
# the tensors we load could be split across multiple files
for part_id in range(n_parts):
fout.seek(offset_of_tensors)
print(f"Processing part {part_id+1} of {n_parts}\n")
fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
model = torch.load(fname_model, map_location="cpu")
process_and_write_variables(fout, model, ftype, part_id, n_parts)
del model
print(f"Done. Output file: {fname_out}\n")
if __name__ == "__main__":
main()
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
parser.add_argument('dir_model', help='directory containing the model checkpoint')
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
args = parser.parse_args()
convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])

@ -1,100 +0,0 @@
#!/usr/bin/env python3
# Original by https://github.com/eiz
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
import argparse
import glob
import os
import struct
import sys
from sentencepiece import SentencePieceProcessor
HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
def parse_args():
parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
parser.add_argument('dir_model', help='directory containing ggml .bin files')
parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
return parser.parse_args()
def read_header(f_in):
struct_fmt = "i" * (3 + len(HPARAMS))
struct_size = struct.calcsize(struct_fmt)
buf = f_in.read(struct_size)
return struct.unpack(struct_fmt, buf)
def write_header(f_out, header):
(magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
if magic != 0x67676d6c:
raise Exception('Invalid file magic. Must be an old style ggml file.')
values = [
0x67676d66, # magic: ggml in hex
1, # file version
vocab_size,
dim,
multiple_of,
n_heads,
n_layers,
rot,
ftype
]
f_out.write(struct.pack("i" * len(values), *values))
def write_tokens(fout, tokenizer):
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode()
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
piece = tokenizer.id_to_piece(i)
if len(piece) != 6:
print(f"Invalid token: {piece}")
sys.exit(1)
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
def read_tokens(f_in, tokenizer):
for i in range(tokenizer.vocab_size()):
len_b = f_in.read(4)
(length,) = struct.unpack("i", len_b)
f_in.read(length)
def copy_all_data(f_out, f_in):
while True:
buf = f_in.read(1024 * 1024)
if not buf:
break
f_out.write(buf)
def convert_one_file(path_in, tokenizer):
path_tmp = f"{path_in}.tmp"
path_orig= f"{path_in}.orig"
print(f"converting {path_in}")
with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
write_header(f_out, read_header(f_in))
read_tokens(f_in, tokenizer)
write_tokens(f_out, tokenizer)
copy_all_data(f_out, f_in)
os.rename(path_in, path_orig)
os.rename(path_tmp, path_in)
def main():
args = parse_args()
files = []
files.extend(glob.glob(f"{args.dir_model}/*.bin"))
files.extend(glob.glob(f"{args.dir_model}/*.bin.*"))
tokenizer = SentencePieceProcessor(args.tokenizer_model)
for file in files:
convert_one_file(file, tokenizer)
if __name__ == "__main__":
main()

File diff suppressed because it is too large Load Diff

@ -1,311 +0,0 @@
# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
#
# We caused a breaking change to the file format on 2023-03-30 in:
# https://github.com/ggerganov/llama.cpp/pull/613
#
# (1) If you still have the Meta LLaMA .pth files, then close this
# file now; you can just run `convert-pth-to-ggml.py` again to
# migrate to the new format. The tool is easier to use too. It
# isn't necessary anymore to manage split output files because
# the new format always combines things into a single file.
#
# (2) If you deleted the Meta LLaMA .pth files due to save on disk
# space, then this tool is intended to help you. Please check
# out the instructions below.
#
# USAGE
#
# python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
#
# PREREQUISITES
#
# pip install numpy
# cd llama.cpp
# make -j4
#
# EXAMPLE (7B MODEL)
#
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
# python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
#
# # check that it works
# ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
# # you can delete the old files
# rm -f models/7B/ggml-model-f16.bin
# mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
#
# EXAMPLE (13B MODEL)
#
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
# python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
#
# # check that it works
# ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
# # you can delete the old files
# rm -f models/13B/ggml-model-f16.bin*
# mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
#
import argparse
import os
import sys
import json
import struct
import numpy as np
QK = 32
GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6
WTYPE_NAMES = {
0: "F32",
1: "F16",
2: "Q4_0",
3: "Q4_1",
}
WTYPES = {
0: GGML_TYPE_F32,
1: GGML_TYPE_F16,
2: GGML_TYPE_Q4_0,
3: GGML_TYPE_Q4_1,
}
GGML_BLCK_SIZE = {
GGML_TYPE_Q4_0: QK,
GGML_TYPE_Q4_1: QK,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 1,
GGML_TYPE_I32: 1,
GGML_TYPE_F16: 1,
GGML_TYPE_F32: 1,
}
GGML_TYPE_SIZE = {
GGML_TYPE_Q4_0: 4 + QK//2,
GGML_TYPE_Q4_1: 4*2 + QK//2,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 2,
GGML_TYPE_I32: 4,
GGML_TYPE_F16: 2,
GGML_TYPE_F32: 4,
}
HPARAMS = [
'magic', # int32
'version', # int32
'n_vocab', # int32
'n_embd', # int32
'n_mult', # int32
'n_head', # int32
'n_layer', # int32
'n_rot', # int32
'f16', # int32
]
def read_hparams(fin):
struct_fmt = "i" * len(HPARAMS)
struct_size = struct.calcsize(struct_fmt)
buf = fin.read(struct_size)
ints = struct.unpack(struct_fmt, buf)
hparams = dict(zip(HPARAMS, ints))
return hparams
def write_hparams(fout, hparams):
struct_fmt = "i" * len(HPARAMS)
struct_size = struct.calcsize(struct_fmt)
ints = [hparams[h] for h in HPARAMS]
fout.write(struct.pack(struct_fmt, *ints))
def read_tokens(fin, hparams):
tokens = []
for i in range(hparams['n_vocab']):
len_b = fin.read(4)
(length,) = struct.unpack("i", len_b)
word = fin.read(length)
score_b = fin.read(4)
(score,) = struct.unpack("f", score_b)
tokens.append((word, score))
return tokens
def write_tokens(fout, tokens):
for word, score in tokens:
fout.write(struct.pack("i", len(word)))
fout.write(word)
fout.write(struct.pack("f", score))
def ggml_nelements(shape):
r = 1
for i in shape:
r *= i
return r
def ggml_nbytes(shape, ftype):
x = ggml_nelements(shape)
t = WTYPES[ftype]
x *= GGML_TYPE_SIZE[t]
x //= GGML_BLCK_SIZE[t]
return x
def copy_tensors(fin, fout, part_id, n_parts):
while True:
b = fin.read(4)
if not b: break
(n_dims,) = struct.unpack("i", b)
b = fin.read(4)
(length,) = struct.unpack("i", b)
b = fin.read(4)
(ftype,) = struct.unpack("i", b)
assert n_dims in (1, 2)
partshape = list(range(n_dims))
for i in range(n_dims):
b = fin.read(4)
partshape[i] = struct.unpack("i", b)[0]
partshape = list(reversed(partshape))
name = fin.read(length)
data = fin.read(ggml_nbytes(partshape, ftype))
blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
# determine dimension along which multipart tensor is sharded
#
# split_dim 0 regex:
# - output.*
# - layers.*.attention.wq.weight
# - layers.*.attention.wk.weight
# - layers.*.attention.wv.weight
# - layers.*.feed_forward.w1.weight
# - layers.*.feed_forward.w3.weight
#
# split_dim 1 regex:
# - tok_embeddings.*
# - layers.*.attention.wo.weight
# - layers.*.feed_forward.w2.weight
#
if n_dims > 1:
split_dim = 1
if b"tok_embeddings" in name:
split_dim = 1
elif b"layers" in name:
if b"attention.wo.weight" in name:
split_dim = 1
elif b"feed_forward.w2.weight" in name:
split_dim = 1
else:
split_dim = 0
elif b"output" in name:
split_dim = 0
# output tensor header
fullshape = list(partshape)
if n_dims > 1:
fullshape[split_dim] *= n_parts
fout.write(struct.pack("iii", n_dims, len(name), ftype))
for dim in reversed(fullshape):
fout.write(struct.pack("i", dim))
fout.write(name)
# ensure tensor data is aligned
tensor_data_offset = fout.tell()
while tensor_data_offset % QK != 0:
fout.write(struct.pack("B", 0))
tensor_data_offset += 1
# output unified mappable tensor data
if n_dims == 1 or n_parts == 1:
# copy tensor which we thankfully received in one piece
if part_id == 0:
fout.write(data)
elif split_dim == 0:
# reassemble multifile tensor containing some of the rows
rows_per_chunk = partshape[0]
current_row = part_id * rows_per_chunk
bytes_per_row = fullshape[1] // blck_size * type_size
offset = current_row * bytes_per_row
fout.seek(tensor_data_offset + offset)
fout.write(data)
elif split_dim == 1:
# reassemble multifile tensor containing some of the cols
cols_per_chunk = partshape[1]
current_col = part_id * cols_per_chunk
bpr = partshape[1] // blck_size * type_size
bytes_per_row = fullshape[1] // blck_size * type_size
offset_current_col = current_col // blck_size * type_size
for row in range(partshape[0]):
offset_row = row * bytes_per_row
offset = offset_row + offset_current_col
fout.seek(tensor_data_offset + offset)
fout.write(data[row * bpr:row * bpr + bpr])
# advance file position to next tensor
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
def parse_args():
parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
parser.add_argument('fout_path', help='your new ggjt file name')
return parser.parse_args()
def main():
args = parse_args()
assert args.fin_path
assert args.fout_path
assert args.fin_path != args.fout_path
with open(args.fin_path, "rb") as fin:
hparams = read_hparams(fin)
tokens = read_tokens(fin, hparams)
if hparams['magic'] == 0x67676a74: # ggjt
print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
sys.exit(1)
if hparams['magic'] != 0x67676d66: # ggmf
print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
sys.exit(1)
hparams['magic'] = 0x67676a74 # ggjt
# count number of multipart files by convention
n_parts = 1
while True:
if os.path.exists(f"{args.fin_path}.{n_parts}"):
n_parts += 1
else:
break
# we output a single file for ggml
with open(args.fout_path, "wb") as fout:
write_hparams(fout, hparams)
write_tokens(fout, tokens)
offset_of_tensors = fout.tell()
# the tensors we load could be split across multiple files
for part_id in range(n_parts):
fout.seek(offset_of_tensors)
print(f"Processing part {part_id+1} of {n_parts}\n")
fin_path = args.fin_path
if part_id > 0:
fin_path += f".{part_id}"
with open(fin_path, "rb") as fin:
read_tokens(fin, read_hparams(fin))
copy_tensors(fin, fout, part_id, n_parts)
print(f"Done. Output file: {args.fout_path}\n")
if __name__ == "__main__":
main()

@ -0,0 +1,2 @@
numpy==1.24
sentencepiece==0.1.97
Loading…
Cancel
Save