/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h>
#include <stdint.h>

#include "./av1_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"

// Coefficient quantization phase 1
// param[0-2] : rounding/quan/dequan constants
static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
                                         const int shift, const int scale,
                                         __m128i *qcoeff, __m128i *dquan,
                                         __m128i *sign) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi32(1);

  *sign = _mm_cmplt_epi32(*coeff, zero);
  *sign = _mm_or_si128(*sign, one);
  *coeff = _mm_abs_epi32(*coeff);

  qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
  qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
  qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);

  qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
  qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
  dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
  dquan[0] = _mm_srli_epi64(dquan[0], scale);
}

// Coefficient quantization phase 2
static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
                                         const __m128i *sign,
                                         const __m128i *param, const int shift,
                                         const int scale, tran_low_t *qAddr,
                                         tran_low_t *dqAddr) {
  __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
  __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);

  qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
  qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
  dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
  dquan[1] = _mm_srli_epi64(dquan[1], scale);

  // combine L&H
  qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
  qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);

  qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
  qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);

  dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
  dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);

  dquan[0] = _mm_and_si128(dquan[0], mask0H);
  dquan[1] = _mm_and_si128(dquan[1], mask0L);

  qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
  dquan[0] = _mm_or_si128(dquan[0], dquan[1]);

  qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
  dquan[0] = _mm_sign_epi32(dquan[0], *sign);

  _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
  _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
}

static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
                            __m128i *eob) {
  const __m128i zero = _mm_setzero_si128();
  __m128i mask, iscanIdx;
  const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
  const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
  __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);
  __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);

  nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);
  nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);

  mask = _mm_packs_epi32(nz_flag0, nz_flag1);
  iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
  iscanIdx = _mm_sub_epi16(iscanIdx, mask);
  iscanIdx = _mm_and_si128(iscanIdx, mask);
  *eob = _mm_max_epi16(*eob, iscanIdx);
}

static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
  __m128i eob_shuffled;
  uint16_t eobValue;
  eob_shuffled = _mm_shuffle_epi32(*eob, 0xe);
  *eob = _mm_max_epi16(*eob, eob_shuffled);
  eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe);
  *eob = _mm_max_epi16(*eob, eob_shuffled);
  eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1);
  *eob = _mm_max_epi16(*eob, eob_shuffled);
  eobValue = _mm_extract_epi16(*eob, 0);
  return eobValue;
}

void av1_highbd_quantize_fp_sse4_1(
    const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, int log_scale) {
  __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign;
  __m128i eob = _mm_setzero_si128();
  const tran_low_t *src = coeff_ptr;
  tran_low_t *quanAddr = qcoeff_ptr;
  tran_low_t *dquanAddr = dqcoeff_ptr;
  const int shift = 16 - log_scale;
  const int coeff_stride = 4;
  const int quan_stride = coeff_stride;
  (void)skip_block;
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;

  memset(quanAddr, 0, count * sizeof(quanAddr[0]));
  memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));

  if (!skip_block) {
    coeff[0] = _mm_loadu_si128((__m128i const *)src);

    qparam[0] =
        _mm_set_epi32(round_ptr[1] >> log_scale, round_ptr[1] >> log_scale,
                      round_ptr[1] >> log_scale, round_ptr[0] >> log_scale);
    qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[0]);
    qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[0]);

    // DC and first 3 AC
    quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
                          &coeff_sign);

    // update round/quan/dquan for AC
    qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
    qparam[1] = _mm_set_epi32(0, quant_ptr[1], 0, quant_ptr[1]);
    qparam[2] = _mm_set_epi32(0, dequant_ptr[1], 0, dequant_ptr[1]);

    quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
                          log_scale, quanAddr, dquanAddr);

    // next 4 AC
    coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
    quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
                          &coeff_sign);
    quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
                          log_scale, quanAddr + quan_stride,
                          dquanAddr + quan_stride);

    find_eob(quanAddr, iscan, &eob);

    count -= 8;

    // loop for the rest of AC
    while (count > 0) {
      src += coeff_stride << 1;
      quanAddr += quan_stride << 1;
      dquanAddr += quan_stride << 1;
      iscan += quan_stride << 1;

      coeff[0] = _mm_loadu_si128((__m128i const *)src);
      coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));

      quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff,
                            dequant, &coeff_sign);
      quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
                            log_scale, quanAddr, dquanAddr);

      quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff,
                            dequant, &coeff_sign);
      quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
                            log_scale, quanAddr + quan_stride,
                            dquanAddr + quan_stride);

      find_eob(quanAddr, iscan, &eob);

      count -= 8;
    }
    *eob_ptr = get_accumulated_eob(&eob);
  } else {
    *eob_ptr = 0;
  }
}
