diff --git a/ggml.c b/ggml.c index a817f83..6db6fde 100644 --- a/ggml.c +++ b/ggml.c @@ -1944,7 +1944,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); - /* Prepare the constants we will need during execution */ + /* Prepare the constants we will need during execution */ const __m256i lowMask = _mm256_set1_epi8( 0xF ); const __m256i offset_8 = _mm256_set1_epi16( 8 ); @@ -1954,61 +1954,59 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest // Main loop for (int i = 0; i < nb; i+=UNROLL_COUNT) { - - // This loop will be unrolled by the compiler + // This loop will be unrolled by the compiler for (int u=0;u we now have a vector of 8 int_32t */ - __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q ); + /* Accumulate the products of int32_t integers -> we now have a vector of 8 int32_t */ + __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q ); - /* Convert to vectore of 8 int32_t to 8 floats */ - __m256 q = _mm256_cvtepi32_ps( xy_q ); + /* Convert the vector of 8 int32_t to 8 floats */ + __m256 q = _mm256_cvtepi32_ps( xy_q ); - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps( scale, q, acc ); + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( scale, q, acc ); } - - } + } // Return horizontal sum of the acc vector __m128 res = _mm256_extractf128_ps( acc, 1 );