@ -680,13 +680,17 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
for ( int i = 0 ; i < nb ; i + + ) {
float amax = 0.0f ; // absolute max
float max = 0.0f ;
for ( int l = 0 ; l < QK4_0 ; l + + ) {
const float v = x [ i * QK4_0 + l ] ;
amax = MAX ( amax , fabsf ( v ) ) ;
if ( amax < fabsf ( v ) ) {
amax = fabsf ( v ) ;
max = v ;
}
}
const float d = amax / ( ( 1 < < 3 ) - 1 ) ;
const float d = max / - 8 ;
const float id = d ? 1.0f / d : 0.0f ;
y [ i ] . d = d ;
@ -695,8 +699,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
const float v0 = x [ i * QK4_0 + l + 0 ] * id ;
const float v1 = x [ i * QK4_0 + l + 1 ] * id ;
const uint8_t vi0 = ( int8_t ) roundf ( v0 ) + 8 ;
const uint8_t vi1 = ( int8_t ) roundf ( v1 ) + 8 ;
const uint8_t vi0 = MIN ( 15 , ( int8_t ) roundf ( v0 ) + 8 ) ;
const uint8_t vi1 = MIN ( 15 , ( int8_t ) roundf ( v1 ) + 8 ) ;
assert ( vi0 < 16 ) ;
assert ( vi1 < 16 ) ;
@ -716,28 +720,42 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
# if defined(__POWER9_VECTOR__)
const vector float v85 = vec_splats ( 8.5f ) ;
const vector signed int v15 = vec_splats ( 15 ) ;
for ( int i = 0 ; i < nb ; i + + ) {
float amax = 0.0f ; // absolute max
float max = 0.0f ;
float min = 0.0f ;
vector float srcv [ 8 ] ;
vector float asrc v[ 8 ] ;
vector float amax v[ 8 ] ;
vector float max v[ 8 ] ;
vector float min v[ 8 ] ;
for ( int l = 0 ; l < 8 ; l + + ) srcv [ l ] = * ( vector float * ) ( x + i * 32 + 4 * l ) ;
for ( int l = 0 ; l < 8 ; l + + ) asrcv [ l ] = vec_abs ( srcv [ l ] ) ;
for ( int l = 0 ; l < 4 ; l + + ) amaxv [ 2 * l ] = vec_max ( asrcv [ 2 * l ] , asrcv [ 2 * l + 1 ] ) ;
//for (int l = 0; l < 2; l++) amaxv[4*l] = vec_max(amaxv[4*l], amaxv[4*l+2]);
amaxv [ 0 ] = vec_max ( amaxv [ 0 ] , amaxv [ 2 ] ) ;
amaxv [ 4 ] = vec_max ( amaxv [ 4 ] , amaxv [ 6 ] ) ;
//for (int l = 0; l < 1; l++) amaxv[8*l] = vec_max(amaxv[8*l], amaxv[8*l+4]);
amaxv [ 0 ] = vec_max ( amaxv [ 0 ] , amaxv [ 4 ] ) ;
amax = MAX (
MAX ( vec_extract ( amaxv [ 0 ] , 0 ) , vec_extract ( amaxv [ 0 ] , 1 ) ) ,
MAX ( vec_extract ( amaxv [ 0 ] , 2 ) , vec_extract ( amaxv [ 0 ] , 3 ) ) ) ;
const float d = amax / ( ( 1 < < 3 ) - 1 ) ;
//for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]);
for ( int l = 0 ; l < 4 ; l + + ) maxv [ 2 * l ] = vec_max ( asrcv [ 2 * l ] , asrcv [ 2 * l + 1 ] ) ;
//for (int l = 0; l < 2; l++) maxv[4*l] = vec_max(maxv[4*l], maxv[4*l+2]);
maxv [ 0 ] = vec_max ( maxv [ 0 ] , maxv [ 2 ] ) ;
maxv [ 4 ] = vec_max ( maxv [ 4 ] , maxv [ 6 ] ) ;
//for (int l = 0; l < 1; l++) maxv[8*l] = vec_max(maxv[8*l], maxv[8*l+4]);
maxv [ 0 ] = vec_max ( maxv [ 0 ] , maxv [ 4 ] ) ;
for ( int l = 0 ; l < 4 ; l + + ) minv [ 2 * l ] = vec_min ( asrcv [ 2 * l ] , asrcv [ 2 * l + 1 ] ) ;
//for (int l = 0; l < 2; l++) minv[4*l] = vec_min(minv[4*l], minv[4*l+2]);
minv [ 0 ] = vec_min ( minv [ 0 ] , minv [ 2 ] ) ;
minv [ 4 ] = vec_min ( minv [ 4 ] , minv [ 6 ] ) ;
//for (int l = 0; l < 1; l++) minv[8*l] = vec_min(minv[8*l], minv[8*l+4]);
minv [ 0 ] = vec_min ( minv [ 0 ] , minv [ 4 ] ) ;
max = MAX (
MAX ( vec_extract ( maxv [ 0 ] , 0 ) , vec_extract ( maxv [ 0 ] , 1 ) ) ,
MAX ( vec_extract ( maxv [ 0 ] , 2 ) , vec_extract ( maxv [ 0 ] , 3 ) ) ) ;
min = MIN (
MIN ( vec_extract ( minv [ 0 ] , 0 ) , vec_extract ( minv [ 0 ] , 1 ) ) ,
MIN ( vec_extract ( minv [ 0 ] , 2 ) , vec_extract ( minv [ 0 ] , 3 ) ) ) ;
const float magnitude = max > = fabsf ( min ) ? max : min ;
const float d = magnitude / - 8 ;
const float id = d ? 1.0 / d : 0.0 ;
y [ i ] . d = d ;
@ -747,27 +765,33 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
for ( int l = 0 ; l < 8 ; l + + ) {
const vector float vf = vec_madd ( srcv [ l ] , vid , v85 ) ;
const vector signed int vi = vec_signed ( vf ) ;
const vector signed int vc = vec_min ( vi , v15 ) ;
pb [ 2 * l + 0 ] = vec_extract ( v i, 0 ) | ( vec_extract ( vi , 1 ) < < 4 ) ;
pb [ 2 * l + 1 ] = vec_extract ( v i, 2 ) | ( vec_extract ( vi , 3 ) < < 4 ) ;
pb [ 2 * l + 0 ] = vec_extract ( v c, 0 ) | ( vec_extract ( vc , 1 ) < < 4 ) ;
pb [ 2 * l + 1 ] = vec_extract ( v c, 2 ) | ( vec_extract ( vc , 3 ) < < 4 ) ;
}
}
# elif __ARM_NEON
for ( int i = 0 ; i < nb ; i + + ) {
float32x4_t srcv [ 8 ] ;
float32x4_t asrc v[ 8 ] ;
float32x4_t amax v[ 8 ] ;
float32x4_t max v[ 8 ] ;
float32x4_t min v[ 8 ] ;
for ( int l = 0 ; l < 8 ; l + + ) srcv [ l ] = vld1q_f32 ( x + i * 32 + 4 * l ) ;
for ( int l = 0 ; l < 8 ; l + + ) asrcv [ l ] = vabsq_f32 ( srcv [ l ] ) ;
for ( int l = 0 ; l < 4 ; l + + ) a maxv[ 2 * l ] = vmaxq_f32 ( a srcv[ 2 * l ] , a srcv[ 2 * l + 1 ] ) ;
for ( int l = 0 ; l < 2 ; l + + ) a maxv[ 4 * l ] = vmaxq_f32 ( a maxv[ 4 * l ] , a maxv[ 4 * l + 2 ] ) ;
for ( int l = 0 ; l < 1 ; l + + ) a maxv[ 8 * l ] = vmaxq_f32 ( a maxv[ 8 * l ] , a maxv[ 8 * l + 4 ] ) ;
for ( int l = 0 ; l < 4 ; l + + ) maxv[ 2 * l ] = vmaxq_f32 ( srcv[ 2 * l ] , srcv[ 2 * l + 1 ] ) ;
for ( int l = 0 ; l < 2 ; l + + ) maxv[ 4 * l ] = vmaxq_f32 ( maxv[ 4 * l ] , maxv[ 4 * l + 2 ] ) ;
for ( int l = 0 ; l < 1 ; l + + ) maxv[ 8 * l ] = vmaxq_f32 ( maxv[ 8 * l ] , maxv[ 8 * l + 4 ] ) ;
const float amax = vmaxvq_f32 ( amaxv [ 0 ] ) ;
for ( int l = 0 ; l < 4 ; l + + ) minv [ 2 * l ] = vminq_f32 ( srcv [ 2 * l ] , srcv [ 2 * l + 1 ] ) ;
for ( int l = 0 ; l < 2 ; l + + ) minv [ 4 * l ] = vminq_f32 ( minv [ 4 * l ] , minv [ 4 * l + 2 ] ) ;
for ( int l = 0 ; l < 1 ; l + + ) minv [ 8 * l ] = vminq_f32 ( minv [ 8 * l ] , minv [ 8 * l + 4 ] ) ;
const float d = amax / ( ( 1 < < 3 ) - 1 ) ;
const float max = vmaxvq_f32 ( maxv [ 0 ] ) ;
const float min = vminvq_f32 ( minv [ 0 ] ) ;
const float magnitude = max > = fabsf ( min ) ? max : min ;
const float d = magnitude / - 8 ;
const float id = d ? 1.0f / d : 0.0f ;
y [ i ] . d = d ;
@ -776,9 +800,10 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
const float32x4_t v = vmulq_n_f32 ( srcv [ l ] , id ) ;
const float32x4_t vf = vaddq_f32 ( v , vdupq_n_f32 ( 8.5f ) ) ;
const int32x4_t vi = vcvtq_s32_f32 ( vf ) ;
const int32x4_t vc = vminq_s32 ( vi , vdupq_n_s32 ( 15 ) ) ;
y [ i ] . qs [ 2 * l + 0 ] = vgetq_lane_s32 ( v i, 0 ) | ( vgetq_lane_s32 ( vi , 1 ) < < 4 ) ;
y [ i ] . qs [ 2 * l + 1 ] = vgetq_lane_s32 ( v i, 2 ) | ( vgetq_lane_s32 ( vi , 3 ) < < 4 ) ;
y [ i ] . qs [ 2 * l + 0 ] = vgetq_lane_s32 ( v c, 0 ) | ( vgetq_lane_s32 ( vc , 1 ) < < 4 ) ;
y [ i ] . qs [ 2 * l + 1 ] = vgetq_lane_s32 ( v c, 2 ) | ( vgetq_lane_s32 ( vc , 3 ) < < 4 ) ;
}
}
# elif defined(__AVX2__)
@ -790,22 +815,31 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
__m256 v3 = _mm256_loadu_ps ( x + 24 ) ;
x + = 32 ;
// Compute max(abs(e)) for the block
const __m256 signBit = _mm256_set1_ps ( - 0.0f ) ;
__m256 maxAbs = _mm256_andnot_ps ( signBit , v0 ) ;
maxAbs = _mm256_max_ps ( maxAbs , _mm256_andnot_ps ( signBit , v1 ) ) ;
maxAbs = _mm256_max_ps ( maxAbs , _mm256_andnot_ps ( signBit , v2 ) ) ;
maxAbs = _mm256_max_ps ( maxAbs , _mm256_andnot_ps ( signBit , v3 ) ) ;
// Compute max for the block
__m256 max = _mm256_max_ps ( v0 , v1 ) ;
__m256 maxTmp = _mm256_max_ps ( v2 , v3 ) ;
max = _mm256_max_ps ( max , maxTmp ) ;
__m128 max4 = _mm_max_ps ( _mm256_extractf128_ps ( max Abs , 1 ) , _mm256_castps256_ps128 ( max Abs ) ) ;
__m128 max4 = _mm_max_ps ( _mm256_extractf128_ps ( max , 1 ) , _mm256_castps256_ps128 ( max ) ) ;
max4 = _mm_max_ps ( max4 , _mm_movehl_ps ( max4 , max4 ) ) ;
max4 = _mm_max_ss ( max4 , _mm_movehdup_ps ( max4 ) ) ;
const float maxScalar = _mm_cvtss_f32 ( max4 ) ;
// Compute min for the block
__m256 min = _mm256_min_ps ( v0 , v1 ) ;
__m256 minTmp = _mm256_min_ps ( v2 , v3 ) ;
min = _mm256_min_ps ( min , minTmp ) ;
__m128 min4 = _mm_min_ps ( _mm256_extractf128_ps ( min , 1 ) , _mm256_castps256_ps128 ( min ) ) ;
min4 = _mm_min_ps ( min4 , _mm_movehl_ps ( min4 , min4 ) ) ;
min4 = _mm_min_ss ( min4 , _mm_movehdup_ps ( min4 ) ) ;
const float minScalar = _mm_cvtss_f32 ( min4 ) ;
// Quantize these floats
const float d = maxScalar / 7.0f ;
const float magnitude = maxScalar > = fabsf ( minScalar ) ? maxScalar : minScalar ;
const float d = magnitude / - 8.0f ;
y [ i ] . d = d ;
const float id = ( maxScalar ! = 0.0f ) ? 7.0f / maxScalar : 0.0f ;
const float id = ( ma gnitude ! = 0.0f ) ? - 8.0f / magnitude : 0.0f ;
const __m256 mul = _mm256_set1_ps ( id ) ;
// Apply the multiplier
@ -838,9 +872,11 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
const __m256i perm = _mm256_setr_epi32 ( 0 , 4 , 1 , 5 , 2 , 6 , 3 , 7 ) ;
i0 = _mm256_permutevar8x32_epi32 ( i0 , perm ) ;
// Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ]
// Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ]
const __m256i off = _mm256_set1_epi8 ( 8 ) ;
i0 = _mm256_add_epi8 ( i0 , off ) ;
const __m256i maxNibble = _mm256_set1_epi8 ( 15 ) ;
i0 = _mm256_min_epi8 ( i0 , maxNibble ) ;
// Compress the vector into 4 bit/value, and store
__m128i res = packNibbles ( i0 ) ;
@ -855,22 +891,31 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
__m256 v3 = _mm256_loadu_ps ( x + 24 ) ;
x + = 32 ;
// Compute max(abs(e)) for the block
const __m256 signBit = _mm256_set1_ps ( - 0.0f ) ;
__m256 maxAbs = _mm256_andnot_ps ( signBit , v0 ) ;
maxAbs = _mm256_max_ps ( maxAbs , _mm256_andnot_ps ( signBit , v1 ) ) ;
maxAbs = _mm256_max_ps ( maxAbs , _mm256_andnot_ps ( signBit , v2 ) ) ;
maxAbs = _mm256_max_ps ( maxAbs , _mm256_andnot_ps ( signBit , v3 ) ) ;
// Compute max for the block
__m256 max = _mm256_max_ps ( v0 , v1 ) ;
__m256 maxTmp = _mm256_max_ps ( v2 , v3 ) ;
max = _mm256_max_ps ( max , maxTmp ) ;
__m128 max4 = _mm_max_ps ( _mm256_extractf128_ps ( max Abs , 1 ) , _mm256_castps256_ps128 ( max Abs ) ) ;
__m128 max4 = _mm_max_ps ( _mm256_extractf128_ps ( max , 1 ) , _mm256_castps256_ps128 ( max ) ) ;
max4 = _mm_max_ps ( max4 , _mm_movehl_ps ( max4 , max4 ) ) ;
max4 = _mm_max_ss ( max4 , _mm_movehdup_ps ( max4 ) ) ;
const float maxScalar = _mm_cvtss_f32 ( max4 ) ;
// Compute min for the block
__m256 min = _mm256_min_ps ( v0 , v1 ) ;
__m256 minTmp = _mm256_min_ps ( v2 , v3 ) ;
min = _mm256_min_ps ( min , minTmp ) ;
__m128 min4 = _mm_min_ps ( _mm256_extractf128_ps ( min , 1 ) , _mm256_castps256_ps128 ( min ) ) ;
min4 = _mm_min_ps ( min4 , _mm_movehl_ps ( min4 , min4 ) ) ;
min4 = _mm_min_ss ( min4 , _mm_movehdup_ps ( min4 ) ) ;
const float minScalar = _mm_cvtss_f32 ( min4 ) ;
// Quantize these floats
const float d = maxScalar / 7.0f ;
const float magnitude = maxScalar > = fabsf ( minScalar ) ? maxScalar : minScalar ;
const float d = magnitude / - 8.0f ;
y [ i ] . d = d ;
const float id = ( maxScalar ! = 0.0f ) ? 7.0f / maxScalar : 0.0f ;
const float id = ( ma gnitude ! = 0.0f ) ? - 8.0f / magnitude : 0.0f ;
const __m256 mul = _mm256_set1_ps ( id ) ;
// Apply the multiplier
@ -911,10 +956,13 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
ni0 = _mm_packs_epi16 ( ni0 , ni2 ) ;
ni4 = _mm_packs_epi16 ( ni4 , ni6 ) ;
// Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ]
const __m128i off = _mm_set1_epi8 ( 8 ) ;
// Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ]
const __m128i off = _mm_set1_epi8 ( 8 ) ;
ni0 = _mm_add_epi8 ( ni0 , off ) ;
ni4 = _mm_add_epi8 ( ni4 , off ) ;
const __m128i maxNibble = _mm_set1_epi8 ( 15 ) ;
ni0 = _mm_min_epi8 ( ni0 , maxNibble ) ;
ni4 = _mm_min_epi8 ( ni4 , maxNibble ) ;
// Compress the vector into 4 bit/value, and store
__m128i res = packNibbles ( ni0 , ni4 ) ;
@ -922,24 +970,32 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
}
# elif defined(__wasm_simd128__)
for ( int i = 0 ; i < nb ; i + + ) {
float amax = 0.0f ; // absolute max
float max = 0.0f ;
float min = 0.0f ;
v128_t srcv [ 8 ] ;
v128_t asrc v[ 8 ] ;
v128_t amax v[ 8 ] ;
v128_t max v[ 8 ] ;
v128_t min v[ 8 ] ;
for ( int l = 0 ; l < 8 ; l + + ) srcv [ l ] = wasm_v128_load ( x + i * 32 + 4 * l ) ;
for ( int l = 0 ; l < 8 ; l + + ) asrcv [ l ] = wasm_f32x4_abs ( srcv [ l ] ) ;
for ( int l = 0 ; l < 4 ; l + + ) a maxv[ 2 * l ] = wasm_f32x4_max ( a srcv[ 2 * l ] , a srcv[ 2 * l + 1 ] ) ;
for ( int l = 0 ; l < 2 ; l + + ) a maxv[ 4 * l ] = wasm_f32x4_max ( a maxv[ 4 * l ] , a maxv[ 4 * l + 2 ] ) ;
for ( int l = 0 ; l < 1 ; l + + ) a maxv[ 8 * l ] = wasm_f32x4_max ( a maxv[ 8 * l ] , a maxv[ 8 * l + 4 ] ) ;
for ( int l = 0 ; l < 4 ; l + + ) maxv[ 2 * l ] = wasm_f32x4_max ( srcv[ 2 * l ] , srcv[ 2 * l + 1 ] ) ;
for ( int l = 0 ; l < 2 ; l + + ) maxv[ 4 * l ] = wasm_f32x4_max ( maxv[ 4 * l ] , maxv[ 4 * l + 2 ] ) ;
for ( int l = 0 ; l < 1 ; l + + ) maxv[ 8 * l ] = wasm_f32x4_max ( maxv[ 8 * l ] , maxv[ 8 * l + 4 ] ) ;
amax = MAX (
MAX ( wasm_f32x4_extract_lane ( amaxv [ 0 ] , 0 ) , wasm_f32x4_extract_lane ( amaxv [ 0 ] , 1 ) ) ,
MAX ( wasm_f32x4_extract_lane ( amaxv [ 0 ] , 2 ) , wasm_f32x4_extract_lane ( amaxv [ 0 ] , 3 ) ) ) ;
for ( int l = 0 ; l < 4 ; l + + ) minv [ 2 * l ] = wasm_f32x4_min ( srcv [ 2 * l ] , srcv [ 2 * l + 1 ] ) ;
for ( int l = 0 ; l < 2 ; l + + ) minv [ 4 * l ] = wasm_f32x4_min ( minv [ 4 * l ] , minv [ 4 * l + 2 ] ) ;
for ( int l = 0 ; l < 1 ; l + + ) minv [ 8 * l ] = wasm_f32x4_min ( minv [ 8 * l ] , minv [ 8 * l + 4 ] ) ;
const float d = amax / ( ( 1 < < 3 ) - 1 ) ;
max = MAX (
MAX ( wasm_f32x4_extract_lane ( maxv [ 0 ] , 0 ) , wasm_f32x4_extract_lane ( maxv [ 0 ] , 1 ) ) ,
MAX ( wasm_f32x4_extract_lane ( maxv [ 0 ] , 2 ) , wasm_f32x4_extract_lane ( maxv [ 0 ] , 3 ) ) ) ;
min = MIN (
MIN ( wasm_f32x4_extract_lane ( minv [ 0 ] , 0 ) , wasm_f32x4_extract_lane ( minv [ 0 ] , 1 ) ) ,
MIN ( wasm_f32x4_extract_lane ( minv [ 0 ] , 2 ) , wasm_f32x4_extract_lane ( minv [ 0 ] , 3 ) ) ) ;
const float magnitude = max > = fabsf ( min ) ? max : min ;
const float d = magnitude / - 8 ;
const float id = d ? 1.0 / d : 0.0 ;
y [ i ] . d = d ;
@ -948,9 +1004,10 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
const v128_t v = wasm_f32x4_mul ( srcv [ l ] , wasm_f32x4_splat ( id ) ) ;
const v128_t vf = wasm_f32x4_add ( v , wasm_f32x4_splat ( 8.5f ) ) ;
const v128_t vi = wasm_i32x4_trunc_sat_f32x4 ( vf ) ;
const v128_t vc = wasm_i32x4_min_u ( vi , wasm_i32x4_splat ( 15 ) ) ;
y [ i ] . qs [ 2 * l + 0 ] = wasm_i32x4_extract_lane ( v i, 0 ) | ( wasm_i32x4_extract_lane ( vi , 1 ) < < 4 ) ;
y [ i ] . qs [ 2 * l + 1 ] = wasm_i32x4_extract_lane ( v i, 2 ) | ( wasm_i32x4_extract_lane ( vi , 3 ) < < 4 ) ;
y [ i ] . qs [ 2 * l + 0 ] = wasm_i32x4_extract_lane ( v c, 0 ) | ( wasm_i32x4_extract_lane ( vc , 1 ) < < 4 ) ;
y [ i ] . qs [ 2 * l + 1 ] = wasm_i32x4_extract_lane ( v c, 2 ) | ( wasm_i32x4_extract_lane ( vc , 3 ) < < 4 ) ;
}
}
# else
@ -1131,13 +1188,17 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
for ( int i = 0 ; i < nb ; i + + ) {
float amax = 0.0f ; // absolute max
float max = 0.0f ;
for ( int l = 0 ; l < QK4_2 ; l + + ) {
const float v = x [ i * QK4_2 + l ] ;
amax = MAX ( amax , fabsf ( v ) ) ;
if ( amax < fabsf ( v ) ) {
amax = fabsf ( v ) ;
max = v ;
}
}
const float d = amax / ( ( 1 < < 3 ) - 1 ) ;
const float d = max / - 8 ;
const float id = d ? 1.0f / d : 0.0f ;
@ -1147,8 +1208,8 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
const float v0 = x [ i * QK4_2 + l + 0 ] * id ;
const float v1 = x [ i * QK4_2 + l + 1 ] * id ;
const uint8_t vi0 = ( uint8_t ) ( v0 + 8.5f ) ;
const uint8_t vi1 = ( uint8_t ) ( v1 + 8.5f ) ;
const uint8_t vi0 = MIN ( 15 , ( int8_t ) roundf ( v0 ) + 8 ) ;
const uint8_t vi1 = MIN ( 15 , ( int8_t ) roundf ( v1 ) + 8 ) ;
assert ( vi0 < 16 ) ;
assert ( vi1 < 16 ) ;
@ -1242,9 +1303,9 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int
block_q4_2 * restrict y = vy ;
//quantize_row_q4_2_reference(x, y, k);
quantize_row_q4_2_reference ( x , y , k ) ;
// This produces the exact same format, just better match to the input floats ("better" as measured by RMSE)
quantize_row_q4_2_rmse ( x , y , k ) ;
//quantize_row_q4_2_rmse(x, y, k);
}
static void quantize_row_q4_3_reference ( const float * restrict x , block_q4_3 * restrict y , int k ) {
@ -1795,7 +1856,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
[ GGML_TYPE_Q4_2 ] = {
. dequantize_row_q = dequantize_row_q4_2 ,
. quantize_row_q = quantize_row_q4_2 ,
. quantize_row_q_reference = ( quantize_row_q_t ) quantize_row_q4_2_r mse, //quantize_row_q4_2_r eference,
. quantize_row_q_reference = ( quantize_row_q_t ) quantize_row_q4_2_r eference ,
. quantize_row_q_dot = quantize_row_q8_0 ,
. vec_dot_q = ggml_vec_dot_q4_2_q8_0 ,
} ,
@ -12127,8 +12188,8 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
for ( int j = 0 ; j < n ; j + = k ) {
block_q4_2 * restrict y = ( block_q4_2 * ) dst + j / QK4_2 ;
//quantize_row_q4_2_reference(src + j, y, k);
quantize_row_q4_2_rmse ( src + j , y , k ) ;
quantize_row_q4_2_reference ( src + j , y , k ) ;
//quantize_row_q4_2_rmse(src + j, y, k);
for ( int i = 0 ; i < nb ; i + + ) {
for ( int l = 0 ; l < QK4_2 ; l + = 2 ) {