/****************************************************************************
**
** Copyright (C) 2013 Imagination Technologies Limited, www.imgtec.com
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtGui module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 3 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL3 included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 3 requirements
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 2.0 or (at your option) the GNU General
** Public license version 3 or any later version approved by the KDE Free
** Qt Foundation. The licenses are as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
** https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/

#include "qt_mips_asm_dsp_p.h"

LEAF_MIPS_DSP(destfetchARGB32_asm_mips_dsp)
/*
 * Copies `length` ARGB32 pixels from src to the buffer while scaling the
 * R, G and B bytes of every pixel by that pixel's own alpha byte. The alpha
 * byte is copied through unchanged. Every product x = channel * alpha is
 * reduced with (x + (x >> 8) + 0x80) >> 8.
 *
 * a0 - buffer address (dst)
 * a1 - data address (src)
 * a2 - length
 *
 * Returns v0 = the buffer address that was passed in a0.
 *
 * An odd length is handled by processing one pixel on its own first; the
 * rest is processed two pixels per iteration.
 *
 * Instructions indented by one extra space sit in a branch delay slot. This
 * layout assumes the LEAF_MIPS_DSP macro (defined in the included header,
 * not visible here) selects .set noreorder - confirm there.
 */

    beqz              a2, 2f
     move             v0, a0         /* delay slot, runs on both paths:
                                      * v0 = buffer address (return value) */
    andi              t1, a2, 0x1
    li                t7, 8388736    /* t7 = 0x800080 */
    beqz              t1, 1f         /* even length: straight to the pair loop */
     nop

    /* odd length: process a single leading pixel */
    lw                t8, 0(a1)
    addiu             a2, a2, -1
    srl               t6, t8, 24     /* t6 = alpha */

    preceu.ph.qbra    t0, t8         /* t0 = | 0 | R | 0 | B | */
    mul               t1, t0, t6     /* t1 = | R * alpha | B * alpha | */
    preceu.ph.qbla    t4, t8         /* t4 = | 0 | A | 0 | G | */
    mul               t5, t4, t6     /* low halfword of t5 = G * alpha */

    preceu.ph.qbla    t2, t1
    addq.ph           t3, t1, t2
    addq.ph           t3, t3, t7
    preceu.ph.qbla    t1, t3         /* t1 holds R & B blended with alpha
                                      * | 0 | dRab | 0 | dBab | */
    preceu.ph.qbla    t2, t5
    addq.ph           t3, t2, t5
    addq.ph           t4, t3, t7
    preceu.ph.qbla    t2, t4         /* t2 holds A & G blended with alpha
                                      * | 0 | dAab | 0 | dGab | */
    andi              t2, t2, 255    /* keep only the low byte: dGab */

    sll               t0, t6, 24     /* original alpha back into bits 31..24 */
    sll               t3, t2, 8
    or                t4, t0, t3
    or                t0, t1, t4
    sw                t0, 0(a0)
    addiu             a0, a0, 4
    addiu             a1, a1, 4
    beqz              a2, 2f         /* there was only one member */
     nop
1:
    /* pair loop: two pixels per iteration */
    lw                t0, 0(a1)      /* t0 = src1 */
    lw                t1, 4(a1)      /* t1 = src2 */
    precrq.qb.ph      t4, t0, t1     /* t4 = a1 G1 a2 G2 */
    preceu.ph.qbra    t3, t4         /* t3 = 0 G1 0 G2 */
    preceu.ph.qbla    t2, t4         /* t2 = | 0 | a1 | 0 | a2 | */
    srl               t5, t2, 8
    or                t8, t2, t5     /* t8 = 0 a1 a1 a2 */
    muleu_s.ph.qbr    t5, t8, t3     /* t5 = | a1 * G1 | a2 * G2 | */

    addiu             a2, a2, -2
    addiu             a1, a1, 8
    precrq.ph.w       t9, t0, t1
    preceu.ph.qbra    t9, t9         /* t9 = | 0 | R1 | 0 | R2 | */

    preceu.ph.qbla    t6, t5
    addq.ph           t5, t5, t6
    addq.ph           t2, t5, t7     /* t2 = rounded G sums (result in the
                                      * high byte of each halfword) */
    muleu_s.ph.qbr    t6, t8, t9     /* t6 = | a1 * R1 | a2 * R2 | */
    sll               t3, t1, 16
    packrl.ph         t3, t0, t3
    preceu.ph.qbra    t3, t3         /* t3 = | 0 | B1 | 0 | B2 | */
    muleu_s.ph.qbr    t8, t8, t3     /* t8 = | a1 * B1 | a2 * B2 | */
    preceu.ph.qbla    t3, t6
    addq.ph           t3, t6, t3
    addq.ph           t3, t3, t7     /* t3 = rounded R sums */
    preceu.ph.qbla    t5, t8
    addq.ph           t5, t8, t5
    addq.ph           t5, t5, t7     /* t5 = rounded B sums */

    precrq.ph.w       t0, t4, t3     /* t0 = | 0 |  a1 | 0 | dR1 | */
    precrq.ph.w       t1, t2, t5     /* t1 = | 0 | dG1 | 0 | dB1 | */
    precrq.qb.ph      t6, t0, t1     /* t6 = | a1 | dR1 | dG1 | dB1 | */
    sll               t3, t3, 16
    sll               t5, t5, 16
    packrl.ph         t0, t4, t3
    packrl.ph         t1, t2, t5
    precrq.qb.ph      t8, t0, t1     /* t8 = | a2 | dR2 | dG2 | dB2 | */
    sw                t6, 0(a0)
    sw                t8, 4(a0)
    bnez              a2, 1b
     addiu            a0, a0, 8
2:
    jr                ra
     nop

END(destfetchARGB32_asm_mips_dsp)

LEAF_MIPS_DSP(qt_memfill32_asm_mips_dsp)
/*
 * Stores the 32-bit value `count` times, starting at the destination
 * address.
 *
 * a0 - destination address (dst)
 * a1 - value
 * a2 - count
 *
 * The (count % 8) leading words are stored one at a time; the remaining
 * multiple of 8 words is stored in groups of eight. Instructions indented by
 * one extra space sit in a branch delay slot (assumes the LEAF macro, which
 * is not visible here, selects .set noreorder).
 */

    beqz      a2, 5f         /* count == 0: nothing to store */
     nop
    li        t8, 8
    andi      t0, a2, 0x7    /* t0 = count % 8 (words to store one by one) */
    beqzl     t0, 2f         /* count is multiple of 8 (8, 16, 24, ....) */
     addiu    a2, a2, -8     /* branch-likely: runs only when the branch is taken */
    subu      a2, a2, t0     /* a2 = words left for the 8-word groups */
1:
    /* store the leading count % 8 words */
    sw        a1, 0(a0)
    addiu     t0, t0, -1
    bnez      t0, 1b
     addiu    a0, a0, 4
    bgeu      a2, t8, 2f     /* at least one full group of 8 left? */
     addiu    a2, a2, -8
    b         5f             /* no: done */
     nop
2:
    /* here a2 = (words still to store) - 8 */
    beqz      a2, 4f         /* exactly one group left: only the tail at 4 */
     nop
3:
    /* one group of 8 per iteration; the last group is left for label 4 */
    pref      30, 32(a0)     /* cache hint 30 for the following 32 bytes */
    addiu     a2, a2, -8
    sw        a1, 0( a0)
    sw        a1, 4(a0)
    sw        a1, 8(a0)
    sw        a1, 12(a0)
    addiu     a0, a0, 32
    sw        a1, -16(a0)
    sw        a1, -12(a0)
    sw        a1, -8(a0)
    bnez      a2, 3b
     sw       a1, -4(a0)
4:
    /* final group of 8 words */
    sw        a1, 0(a0)
    sw        a1, 4(a0)
    sw        a1, 8(a0)
    sw        a1, 12(a0)
    addiu     a0, a0, 32
    sw        a1, -16(a0)
    sw        a1, -12(a0)
    sw        a1, -8(a0)
    sw        a1, -4(a0)
5:
    jr        ra
     nop

END(qt_memfill32_asm_mips_dsp)

LEAF_MIPS_DSP(comp_func_SourceOver_asm_mips_dsp)
/*
 * Source-over composition of `length` pixels, one pixel per iteration:
 *
 *   const_alpha == 255:  dest = s + BYTE_MUL(dest, ~qAlpha(s))
 *                        (s fully opaque -> dest = s, s == 0 -> dest kept)
 *   otherwise:           s'   = BYTE_MUL(s, const_alpha)
 *                        dest = s' + BYTE_MUL(dest, ~qAlpha(s'))
 *
 * where s = src[i] and BYTE_MUL(x, a) scales each byte of x by a using
 * (v + (v >> 8) + 0x80) >> 8 on the 16-bit products.
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 */

    beqz              a2, 5f
     nop
    li                t8, 0xff
    li                t7, 8388736    /* t7 = 0x800080 */
    bne               a3, t8, 4f
     nop

/* part where const_alpha = 255 */
    b                 2f
     nop
1:
    /* src pixel was 0: leave dest[i] untouched and advance */
    addiu             a0, a0, 4
    addiu             a2, a2, -1
    beqz              a2, 5f
     nop
2:
    lw                t0, 0(a1)      /* t0 = s = src[i] */
    addiu             a1, a1, 4
    nor               t1, t0, zero
    srl               t1, t1, 24     /* t1 = ~qAlpha(s) */
    bnez              t1, 3f         /* not fully opaque: blend */
     nop
    sw                t0, 0(a0)      /* dst[i] = src[i] */
    addiu             a2, a2, -1
    bnez              a2, 2b
     addiu            a0, a0, 4
    b 5f
     nop
3:
    beqz              t0, 1b         /* s == 0: nothing to add */
     nop

    lw                t4, 0(a0)      /* t4 = dst[i] */
    replv.ph          t6, t1         /* low halfword of t1 in both halfwords */
    muleu_s.ph.qbl    t2, t4, t6     /* products of the two upper bytes of dst */
    muleu_s.ph.qbr    t3, t4, t6     /* products of the two lower bytes of dst */
    addiu             a2, a2, -1
    preceu.ph.qbla    t4, t2
    addq.ph           t4, t2, t4
    addq.ph           t4, t4, t7
    preceu.ph.qbla    t5, t3
    addq.ph           t5, t5, t3
    addq.ph           t5, t5, t7
    precrq.qb.ph      t8, t4, t5    /* t8 = | dsA | dsR | dsG | dsB | */
    addu              t8, t0, t8    /* dst[i] =
                                     * s + BYTE_MUL(dst[i],~qAlpha(s)) */
    sw                t8, 0(a0)
    bnez              a2, 2b
     addiu            a0, a0, 4
    b                 5f
     nop
4:
    /* part where const_alpha != 255 */
    lw                t0, 0(a0)     /* t0 - dst[i] "1" */
    lw                t1, 0(a1)     /* t1 - src[i] "2" */
    addiu             a1, a1, 4
    addiu             a2, a2, -1
    replv.ph          t6, a3        /* t6 = low halfword of const_alpha in
                                     * both halfwords */
    muleu_s.ph.qbl    t2, t1, t6
    muleu_s.ph.qbr    t3, t1, t6
    preceu.ph.qbla    t4, t2
    addq.ph           t4, t2, t4
    addq.ph           t4, t4, t7
    preceu.ph.qbla    t5, t3
    addq.ph           t5, t5, t3
    addq.ph           t5, t5, t7
    precrq.qb.ph      t8, t4, t5    /* t8 = | dsA | dsR | dsG | dsB | */

    nor               t6, t8, zero
    srl               t6, t6, 24    /* t6 = ~qAlpha(t8) */
    replv.ph          t6, t6

    muleu_s.ph.qbl    t2, t0, t6
    muleu_s.ph.qbr    t3, t0, t6
    preceu.ph.qbla    t4, t2
    addq.ph           t4, t2, t4
    addq.ph           t4, t4, t7
    preceu.ph.qbla    t5, t3
    addq.ph           t5, t5, t3
    addq.ph           t5, t5, t7
    precrq.qb.ph      t6, t4, t5    /* t6 = | ddA | ddR | ddG | ddB | */

    addu              t0, t8, t6    /* dst[i] = scaled src + scaled dst */
    sw                t0, 0(a0)
    bnez              a2, 4b
     addiu            a0, a0, 4
5:
    jr                ra
     nop

END(comp_func_SourceOver_asm_mips_dsp)

LEAF_MIPS_DSPR2(qt_destStoreARGB32_asm_mips_dsp)
/*
 * Converts `length` pixels from buffer and stores them to data.
 * Per pixel, with alpha = pixel >> 24 and q = 0x00ff0000 / alpha:
 *
 *   alpha == 255 -> the pixel is stored unchanged
 *   alpha == 0   -> 0 is stored
 *   otherwise    -> out = (pixel & 0xff000000)
 *                       | ((R * q) & 0x00ff0000)
 *                       | (((G * q) >> 8) & 0xff00)
 *                       | ((B * q) >> 16)
 *
 * a0 - uint * data
 * a1 - const uint *buffer
 * a2 - int length
 *
 * v1 counts the pixels processed so far; the loop ends when v1 == length.
 * Instructions indented by one extra space sit in a branch delay slot
 * (assumes the LEAF macro, not visible here, selects .set noreorder).
 */

    blez      a2, 6f
     move     v1, zero           /* delay slot: v1 = pixels done so far */
    li        t0, 255
    lui       a3, 0xff           /* a3 = 0x00ff0000 */
    b         2f                 /* PC-relative, like every other local jump */
     lui      t2, 0xff00         /* delay slot: t2 = 0xff000000 */
1:
    /* alpha == 0: store a zero pixel */
    addiu     v1, v1, 1
    sw        zero, 0(a0)
    addiu     a1, a1, 4
    beq       v1, a2, 6f
     addiu    a0, a0, 4
2:
    lw        v0, 0(a1)
    srl       t3, v0, 0x18       /* t3 = alpha */
    beql      t3, t0, 5f         /* alpha == 255: store the pixel as is */
     addiu    v1, v1, 1          /* branch-likely: runs only when taken */
    beqz      t3, 1b
     srl      t1, v0, 0x8        /* delay slot */
    andi      t1, t1, 0xff       /* t1 = G */

    teq       t3, zero, 0x7      /* trap if the divisor is zero */
    div       zero, a3, t3       /* LO = 0x00ff0000 / alpha */
    move      t8, t3             /* t8 = alpha */
    andi      t6, v0, 0xff       /* t6 = B */

    srl       t3, v0, 0x10
    andi      t3, t3, 0xff       /* t3 = R */

    and       t5, v0, t2         /* t5 = alpha byte, still in bits 31..24 */
    mflo      t4                 /* t4 = q */

    mul       t3, t3, t4         /* t3 = R * q. Issued before the mult
                                  * instructions: the MIPS32 ISA leaves HI/LO
                                  * unpredictable after MUL, and $ac0 is read
                                  * below. */
    mult      $ac0, t4, t6       /* ac0 = B * q */
    mult      $ac1, t1, t4       /* ac1 = G * q */

    sltiu     t8, t8, 2
    beqz      t8, 3f             /* alpha >= 2 */
     nop
    mflo      t6, $ac0
    mflo      t1, $ac1
    sra       t6, t6, 0x10
    sra       t1, t1, 0x8
    b         4f
     nop
3:
    extr.w    t6, $ac0, 0x10     /* t6 = (B * q) >> 16 */
    extr.w    t1, $ac1, 0x8      /* t1 = (G * q) >> 8 */
4:
    and       v0, t3, a3         /* R channel, bits 23..16 */
    or        v0, v0, t6
    or        v0, v0, t5
    andi      t1, t1, 0xff00     /* G channel, bits 15..8 */
    or        v0, v0, t1
    addiu     v1, v1, 1
5:
    sw        v0, 0(a0)
    addiu     a1, a1, 4
    bne       v1, a2, 2b
     addiu    a0, a0, 4
6:
    jr        ra
     nop

END(qt_destStoreARGB32_asm_mips_dsp)

LEAF_MIPS_DSP(comp_func_solid_Source_dsp_asm_x2)
/*
 * Two pixels per iteration:
 *
 *   dest = color + BYTE_MUL(dest, ialpha)
 *
 * BYTE_MUL_x2 comes from the included header and is not visible here; the
 * formula assumes it implements Qt's BYTE_MUL on two pixels at once.
 *
 * a0 - uint *dest (read and written)
 * a1 - int length
 * a2 - uint color
 * a3 - uint ialpha
 *
 * The counter drops by 2 per iteration and the loop stops only at exactly
 * zero, so length must be even.
 */

    beqz              a1, 2f
     nop
    replv.ph          a3, a3        /* low halfword of ialpha in both halfwords */
    li                t9, 8388736    /* t9 = 0x800080 */
1:
    lw                t0, 0(a0)
    lw                t1, 4(a0)
    or                t2, t0, t1    /* if both dest are zero, no computation needed */
    beqz              t2, 12f
     addiu             a1, -2       /* delay slot: two pixels consumed */

    BYTE_MUL_x2 t0, t1, t6, t7, a3, a3, t9, t2, t3, t4, t5, 0
11:
    addu              t2, a2, t6
    addu              t3, a2, t7
    sw                t2, 0(a0)
    sw                t3, 4(a0)
    bnez              a1, 1b
     addiu             a0, 8
    b                 2f
     nop                            /* explicit delay slot; previously the
                                     * first instruction of label 12 ran here */
12:
    /* both dest pixels are 0: t0 = t1 = 0, so this stores color */
    addu              t2, a2, t0
    addu              t3, a2, t1
    sw                t2, 0(a0)
    sw                t3, 4(a0)
    bnez              a1, 1b
     addiu             a0, 8
2:
    jr                ra
     nop

END(comp_func_solid_Source_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_solid_DestinationOver_dsp_asm_x2)
/*
 * Two pixels per iteration:
 *
 *   dest = dest + BYTE_MUL(color, qAlpha(~dest))
 *
 * BYTE_MUL_x2 comes from the included header and is not visible here; the
 * formula assumes it implements Qt's BYTE_MUL on two pixels at once.
 *
 * a0 - uint *dest
 * a1 - int length
 * a2 - uint color
 *
 * Returns immediately when length or color is 0. Only caller-saved
 * registers (t0-t9, a3) are used, so no stack frame is needed. The counter
 * drops by 2 per iteration and the loop stops only at exactly zero, so
 * length must be even.
 */

    beqz              a1, 2f
     nop
    beqz              a2, 2f         /* color == 0 adds nothing */
     nop
    li                t9, 8388736    /* t9 = 0x800080 */

1:
    lw                t0, 0(a0)
    lw                t1, 4(a0)
    not               t2, t0
    not               t3, t1
    srl               t4, t2, 24     /* t4 = qAlpha(~dest 1) */
    srl               t5, t3, 24     /* t5 = qAlpha(~dest 2) */
    or                t2, t4, t5    /* both zero means both dest pixels are
                                     * fully opaque: store them back as is */
    beqz              t2, 11f
     addiu             a1, -2
    replv.ph          t2, t4
    replv.ph          t3, t5

    BYTE_MUL_x2 a2, a2, t8, a3, t2, t3, t9, t4, t5, t6, t7

    addu              t0, t0, t8
    addu              t1, t1, a3
11:
    sw                t0, 0(a0)
    sw                t1, 4(a0)
    bnez              a1, 1b
     addiu             a0, 8

2:
    jr                ra
     nop

END(comp_func_solid_DestinationOver_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_DestinationOver_dsp_asm_x2)
/*
 * Two pixels per iteration:
 *
 *   const_alpha == 255:  dest = dest + BYTE_MUL(src, qAlpha(~dest))
 *   otherwise:           s    = BYTE_MUL(src, const_alpha)
 *                        dest = dest + BYTE_MUL(s, qAlpha(~dest))
 *
 * BYTE_MUL_x2 comes from the included header and is not visible here; the
 * formulas assume it implements Qt's BYTE_MUL on two pixels at once.
 *
 * a0 - uint *dest
 * a1 - uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * AT is used as an ordinary register (hence .set noat); s0 and s1 are saved
 * on the stack and restored before returning. The counter drops by 2 per
 * iteration and the loops stop only at exactly zero, so length must be even.
 */

    .set              noat
    addiu             sp, sp, -8
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    beqz              a2, 3f
     nop
    li                t9, 8388736    /* t9 = 0x800080 */
    li                t0, 0xff
    beq               a3, t0, 2f
     nop

/* part where const_alpha != 255 */
1:
    replv.ph          a3, a3        /* low halfword of const_alpha in both
                                     * halfwords */
11:
    lw                t0, 0(a1)     /* src_1 */
    lw                t1, 4(a1)     /* src_2 */
    addiu             a2, -2

    BYTE_MUL_x2 t0, t1, t8, AT, a3, a3, t9, t4, t5, t6, t7, 0
                                    /* t8 = s1 */
                                    /* AT = s2 */
    lw                t0, 0(a0)     /* dest_1 */
    lw                t1, 4(a0)     /* dest_2 */
    addiu             a1, 8
    not               t2, t0
    not               t3, t1
    srl               t4, t2, 24
    srl               t5, t3, 24
    replv.ph          t2, t4        /* qAlpha(~d) 1 */
    replv.ph          t3, t5        /* qAlpha(~d) 2 */

    BYTE_MUL_x2 t8, AT, s0, s1, t2, t3, t9, t4, t5, t6, t7

    addu              t0, t0, s0
    addu              t1, t1, s1
    sw                t0, 0(a0)
    sw                t1, 4(a0)
    bnez              a2, 11b
     addiu             a0, 8
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t0, 0(a0)     /* dest 1 */
    lw                t1, 4(a0)     /* dest 2 */
    lw                s0, 0(a1)     /* src 1 */
    lw                s1, 4(a1)     /* src 2 */
    not               t2, t0
    not               t3, t1
    srl               t4, t2, 24
    srl               t5, t3, 24
    replv.ph          t2, t4        /* qAlpha(~d) 1 */
    replv.ph          t3, t5        /* qAlpha(~d) 2 */
    addiu             a1, 8
    addiu             a2, -2

    BYTE_MUL_x2 s0, s1, t8, AT, t2, t3, t9, t4, t5, t6, t7

    addu              t0, t0, t8
    addu              t1, t1, AT
    sw                t0, 0(a0)
    sw                t1, 4(a0)
    bnez              a2, 2b
     addiu             a0, 8

3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    addiu             sp, sp, 8
    jr                ra
     nop
    .set              at

END(comp_func_DestinationOver_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_solid_SourceIn_dsp_asm_x2)
/*
 * Two pixels per iteration:
 *
 *   const_alpha == 255:  dest = BYTE_MUL(color, qAlpha(dest))
 *   otherwise:           color = BYTE_MUL(color, const_alpha)
 *                        cia   = 255 - const_alpha
 *                        dest  = INTERPOLATE_PIXEL_255(color, qAlpha(dest),
 *                                                      dest, cia)
 *
 * BYTE_MUL, BYTE_MUL_x2 and INTERPOLATE_PIXEL_255 come from the included
 * header and are not visible here; the formulas assume they implement the
 * Qt operations of the same names.
 *
 * a0 - uint *dest
 * a1 - int length
 * a2 - uint color
 * a3 - uint const_alpha
 *
 * AT is used as an ordinary register (hence .set noat). s0-s2 are saved in a
 * 16-byte frame: three words are needed, rounded up so that sp keeps the
 * 8-byte alignment the o32 ABI requires. The counter drops by 2 per
 * iteration and the loops stop only at exactly zero, so length must be even.
 */

    .set              noat
    addiu             sp, -16
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    beqz              a1, 3f
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    li                t0, 0xff
    beq               a3, t0, 2f
     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */

/* part where const_alpha != 255 */
1:
    replv.ph          t0, a3
    li                t5, 0xff
    BYTE_MUL a2, a2, t0, t9, t1, t2, t3, t4    /* a2 = color ( = BYTE_MUL(color, const_alpha)); */
    subu              t1, t5, a3               /* t1 = cia = 255 - const_alpha */
11:
    lw                t2, 0(a0)                /* t2 = d 1 */
    lw                s0, 4(a0)                /* s0 = d 2 */
    addiu             a1, -2
    srl               t3, t2, 24               /* t3 = qAlpha(d 1) */
    srl               s2, s0, 24               /* s2 = qAlpha(d 2) */

    INTERPOLATE_PIXEL_255 a2, t3, t2, t1, AT, t9, t8, t4, t5, t6, t7
    INTERPOLATE_PIXEL_255 a2, s2, s0, t1, s1, t9, t8, t4, t5, t6, t7

    sw                AT, 0(a0)
    sw                s1, 4(a0)
    bnez              a1, 11b
     addiu            a0, 8
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t0, 0(a0)                /* dest 1 */
    lw                t1, 4(a0)                /* dest 2 */
    srl               t4, t0, 24
    srl               t5, t1, 24
    replv.ph          t2, t4                   /* qAlpha(dest 1) */
    replv.ph          t3, t5                   /* qAlpha(dest 2) */
    addiu             a1, -2

    BYTE_MUL_x2 a2, a2, t8, AT, t2, t3, t9, t4, t5, t6, t7

    sw                t8, 0(a0)
    sw                AT, 4(a0)
    bnez              a1, 2b
     addiu             a0, 8

3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    addiu             sp, 16
    jr                ra
     nop
    .set              at

END(comp_func_solid_SourceIn_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_SourceIn_dsp_asm_x2)
/*
 * Two pixels per iteration:
 *
 *   const_alpha == 255:  dest = BYTE_MUL(src, qAlpha(dest))
 *   otherwise:           s    = BYTE_MUL(src, const_alpha)
 *                        cia  = 255 - const_alpha
 *                        dest = INTERPOLATE_PIXEL_255(s, qAlpha(dest),
 *                                                     dest, cia)
 *
 * BYTE_MUL_x2 and INTERPOLATE_PIXEL_255 come from the included header and
 * are not visible here; the formulas assume they implement the Qt
 * operations of the same names.
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * AT is used as an ordinary register (hence .set noat); s0-s3 are saved on
 * the stack and restored before returning. The counter drops by 2 per
 * iteration and the loops stop only at exactly zero, so length must be even.
 */

    .set              noat
    addiu             sp, -16
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    sw                s3, 12(sp)
    beqz              a2, 3f
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    li                t0, 0xff
    beq               a3, t0, 2f
     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */

/* part where const_alpha != 255 */
1:
    li                t5, 0xff
    subu              t7, t5, a3               /* t7 = cia = 255 - const_alpha */
    replv.ph          a3, a3                   /* low halfword of const_alpha in
                                                * both halfwords */
11:
    lw                t0, 0(a1)                /* t0 = src 1 */
    lw                t1, 4(a1)                /* t1 = src 2 */
    addiu             a2, -2

    BYTE_MUL_x2 t0, t1, AT, s0, a3, a3, t9, t3, t4, t5, t6, 0
                                               /* AT = s 1, s0 = s 2 */

    lw                t0, 0(a0)                /* t0 = dest 1 */
    lw                t1, 4(a0)                /* t1 = dest 2 */
    addiu             a1, 8

    srl               t2, t0, 24               /* t2 = qAlpha(d) 1 */
    srl               t3, t1, 24               /* t3 = qAlpha(d) 2 */

    INTERPOLATE_PIXEL_255 AT, t2, t0, t7, s1, t9, t8, t4, t5, t6, s3
    INTERPOLATE_PIXEL_255 s0, t3, t1, t7, s2, t9, t8, t4, t5, t6, s3

    sw                s1, 0(a0)
    sw                s2, 4(a0)
    bnez              a2, 11b
     addiu            a0, 8
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t2, 0(a0)                /* dest 1 */
    lw                t3, 4(a0)                /* dest 2 */
    lw                t0, 0(a1)                /* src 1 */
    lw                t1, 4(a1)                /* src 2 */
    srl               t4, t2, 24
    srl               t5, t3, 24
    replv.ph          t2, t4                   /* qAlpha(dest 1) */
    replv.ph          t3, t5                   /* qAlpha(dest 2) */
    addiu             a2, -2

    BYTE_MUL_x2 t0, t1, t8, AT, t2, t3, t9, t4, t5, t6, t7

    addiu             a1, 8
    sw                t8, 0(a0)
    sw                AT, 4(a0)
    bnez              a2, 2b
     addiu             a0, 8

3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    lw                s3, 12(sp)
    addiu             sp, 16
    jr                ra
     nop
    .set              at

END(comp_func_SourceIn_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_solid_DestinationIn_dsp_asm_x2)
/*
 * Two pixels per iteration:
 *
 *   dest = BYTE_MUL(dest, a)
 *
 * BYTE_MUL_x2 comes from the included header and is not visible here; the
 * formula assumes it implements Qt's BYTE_MUL on two pixels at once.
 *
 * a0 - uint *dest
 * a1 - int length
 * a2 - uint a
 *
 * AT is used as an ordinary register (hence .set noat). The counter drops
 * by 2 per iteration and the loop stops only at exactly zero, so length
 * must be even.
 */

    .set              noat
    beqz              a1, 2f
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    replv.ph          a2, a2                  /* low halfword of a in both halfwords */
1:
    lw                t0, 0(a0)               /* dest 1 */
    lw                t1, 4(a0)               /* dest 2 */
    addiu             a1, -2

    BYTE_MUL_x2 t0, t1, t8, AT, a2, a2, t9, t4, t5, t6, t7, 0

    sw                t8, 0(a0)
    sw                AT, 4(a0)
    bnez              a1, 1b
     addiu            a0, 8
2:
    jr                ra
     nop
    .set              at

END(comp_func_solid_DestinationIn_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_DestinationIn_dsp_asm_x2)
/*
 * Two pixels per iteration:
 *
 *   const_alpha == 255:  dest = BYTE_MUL(dest, qAlpha(src))
 *   otherwise:           cia  = 255 - const_alpha
 *                        a    = BYTE_MUL(qAlpha(src), const_alpha) + cia
 *                        dest = BYTE_MUL(dest, a)
 *
 * BYTE_MUL_x2 comes from the included header and is not visible here; the
 * formulas assume it implements Qt's BYTE_MUL on two pixels at once.
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * s0 and s1 are saved on the stack and restored before returning. The
 * counter drops by 2 per iteration and the loops stop only at exactly zero,
 * so length must be even.
 */

    addiu             sp, -8
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    beqz              a2, 3f
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    li                t0, 0xff
    beq               a3, t0, 2f
     nop

/* part where const_alpha != 255 */
1:
    li                t5, 0xff
    subu              t8, t5, a3               /* t8 = cia = 255 - const_alpha */
    replv.ph          a3, a3                   /* low halfword of const_alpha in
                                                * both halfwords */
11:
    lw                t0, 0(a1)                /* t0 = src 1 */
    lw                t1, 4(a1)                /* t1 = src 2 */
    addiu             a2, -2
    srl               t0, t0, 24               /* t0 = qAlpha(src 1) */
    srl               t1, t1, 24               /* t1 = qAlpha(src 2) */

    BYTE_MUL_x2 t0, t1, s1, t7, a3, a3, t9, t3, t4, t5, t6, 0

    lw                t0, 0(a0)                /* t0 = dest 1 */
    lw                t1, 4(a0)                /* t1 = dest 2 */
    addu              s1, s1, t8               /* a 1 */
    addu              t7, t7, t8               /* a 2 */
    replv.ph          t2, s1
    replv.ph          t3, t7

    BYTE_MUL_x2 t0, t1, s1, t7, t2, t3, t9, t4, t5, t6, s0

    addiu             a1, 8
    sw                s1, 0(a0)
    sw                t7, 4(a0)
    bnez              a2, 11b
     addiu            a0, 8
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t2, 0(a1)                /* src 1 */
    lw                t3, 4(a1)                /* src 2 */
    lw                t0, 0(a0)                /* dest 1 */
    lw                t1, 4(a0)                /* dest 2 */
    srl               t4, t2, 24
    srl               t5, t3, 24
    replv.ph          t2, t4                   /* t2 = qAlpha(src 1) */
    replv.ph          t3, t5                   /* t3 = qAlpha(src 2) */
    addiu             a2, -2

    BYTE_MUL_x2 t0, t1, t8, s1, t2, t3, t9, t4, t5, t6, t7

    addiu             a1, 8
    sw                t8, 0(a0)
    sw                s1, 4(a0)
    bnez              a2, 2b
     addiu             a0, 8

3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    addiu             sp, 8
    jr                ra
     nop

END(comp_func_DestinationIn_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_DestinationOut_dsp_asm_x2)
/*
 * Two pixels per iteration:
 *
 *   const_alpha == 255:  dest = BYTE_MUL(dest, qAlpha(~src))
 *   otherwise:           cia  = 255 - const_alpha
 *                        a    = BYTE_MUL(qAlpha(~src), const_alpha) + cia
 *                        dest = BYTE_MUL(dest, a)
 *
 * BYTE_MUL_x2 comes from the included header and is not visible here; the
 * formulas assume it implements Qt's BYTE_MUL on two pixels at once.
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * AT is used as an ordinary register (hence .set noat). s0 is saved in an
 * 8-byte frame: one word is needed, rounded up so that sp keeps the 8-byte
 * alignment the o32 ABI requires. The counter drops by 2 per iteration and
 * the loops stop only at exactly zero, so length must be even.
 */

    .set              noat
    addiu             sp, -8
    sw                s0, 0(sp)
    beqz              a2, 3f
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    li                t0, 0xff
    beq               a3, t0, 2f
     nop

/* part where const_alpha != 255 */
1:
    li                t5, 0xff
    subu              t8, t5, a3               /* t8 = cia = 255 - const_alpha */
    replv.ph          a3, a3                   /* low halfword of const_alpha in
                                                * both halfwords */
11:
    lw                t0, 0(a1)                /* t0 = src 1 */
    lw                t1, 4(a1)                /* t1 = src 2 */
    not               t0, t0
    not               t1, t1
    addiu             a2, -2
    srl               t0, t0, 24               /* t0 = qAlpha(~src 1) */
    srl               t1, t1, 24               /* t1 = qAlpha(~src 2) */

    BYTE_MUL_x2       t0, t1, AT, t7, a3, a3, t9, t3, t4, t5, t6, 0

    lw                t0, 0(a0)                /* t0 = dest 1 */
    lw                t1, 4(a0)                /* t1 = dest 2 */
    addu              AT, AT, t8               /* a 1 */
    addu              t7, t7, t8               /* a 2 */
    replv.ph          t2, AT
    replv.ph          t3, t7

    BYTE_MUL_x2 t0, t1, AT, t7, t2, t3, t9, t4, t5, t6, s0

    addiu             a1, 8
    sw                AT, 0(a0)
    sw                t7, 4(a0)
    bnez              a2, 11b
     addiu            a0, 8
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t2, 0(a1)                /* src 1 */
    lw                t3, 4(a1)                /* src 2 */
    not               t2, t2
    not               t3, t3
    lw                t0, 0(a0)                /* dest 1 */
    lw                t1, 4(a0)                /* dest 2 */
    srl               t4, t2, 24
    srl               t5, t3, 24
    replv.ph          t2, t4                   /* t2 = qAlpha(~src 1) */
    replv.ph          t3, t5                   /* t3 = qAlpha(~src 2) */
    addiu             a2, -2

    BYTE_MUL_x2 t0, t1, t8, AT, t2, t3, t9, t4, t5, t6, t7

    addiu             a1, 8
    sw                t8, 0(a0)
    sw                AT, 4(a0)
    bnez              a2, 2b
     addiu             a0, 8

3:
    lw                s0, 0(sp)
    addiu             sp, 8
    jr                ra
     nop
    .set              at

END(comp_func_DestinationOut_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_solid_SourceAtop_dsp_asm_x2)
/*
 * Two pixels per iteration:
 *
 *   dest = INTERPOLATE_PIXEL_255(color, qAlpha(dest), dest, sia)
 *
 * INTERPOLATE_PIXEL_255 comes from the included header and is not visible
 * here; the formula assumes it implements the Qt operation of that name.
 *
 * a0 - uint *dest
 * a1 - int length
 * a2 - uint color
 * a3 - uint sia
 *
 * AT is used as an ordinary register (hence .set noat). s0 is saved in an
 * 8-byte frame: one word is needed, rounded up so that sp keeps the 8-byte
 * alignment the o32 ABI requires. The counter drops by 2 per iteration and
 * the loop stops only at exactly zero, so length must be even.
 */

    .set              noat
    addiu             sp, -8
    sw                s0, 0(sp)
    beqz              a1, 2f
     nop
    li                t9, 8388736              /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    ori               t8, t8, 0xff00           /* t8 = 0xff00ff00 (andi_factor) */
1:
    lw                t0, 0(a0)                /* t0 = dest 1 */
    lw                t1, 4(a0)                /* t1 = dest 2 */
    addiu             a1, -2
    srl               t2, t0, 24               /* t2 = qAlpha(dest 1) */
    srl               t3, t1, 24               /* t3 = qAlpha(dest 2) */

    INTERPOLATE_PIXEL_255 a2, t2, t0, a3, AT, t9, t8, t4, t5, t6, t7
    INTERPOLATE_PIXEL_255 a2, t3, t1, a3, s0, t9, t8, t4, t5, t6, t7

    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a1, 1b
     addiu            a0, 8
2:
    lw                s0, 0(sp)
    addiu             sp, 8
    jr                ra
     nop
    .set              at

END(comp_func_solid_SourceAtop_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_SourceAtop_dsp_asm_x2)
/*
 * Two pixels per iteration:
 *
 *   s    = src                          (const_alpha == 255)
 *   s    = BYTE_MUL(src, const_alpha)   (otherwise)
 *   dest = INTERPOLATE_PIXEL_255(s, qAlpha(dest), dest, qAlpha(~s))
 *
 * BYTE_MUL_x2 and INTERPOLATE_PIXEL_255 come from the included header and
 * are not visible here; the formulas assume they implement the Qt
 * operations of the same names.
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * AT is used as an ordinary register (hence .set noat). s0-s4 are saved in
 * a 24-byte frame: five words are needed, rounded up so that sp keeps the
 * 8-byte alignment the o32 ABI requires. The counter drops by 2 per
 * iteration and the loops stop only at exactly zero, so length must be even.
 */

    .set              noat
    addiu             sp, -24
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    sw                s3, 12(sp)
    sw                s4, 16(sp)
    beqz              a2, 3f
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    li                t0, 0xff
    beq               a3, t0, 2f
     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */

/* part where const_alpha != 255 */
1:
    replv.ph          a3, a3                   /* low halfword of const_alpha in
                                                * both halfwords */
11:
    lw                AT, 0(a1)                /* src 1 */
    lw                s0, 4(a1)                /* src 2 */

    BYTE_MUL_x2 AT, s0, t0, t1, a3, a3, t9, t3, t4, t5, t6, 0
                                               /* t0 = s 1, t1 = s 2 */

    lw                t2, 0(a0)                /* t2 = dest 1 */
    lw                t3, 4(a0)                /* t3 = dest 2 */

    srl               t4, t2, 24               /* t4 = qAlpha(dest 1) */
    srl               t5, t3, 24               /* t5 = qAlpha(dest 2) */
    not               t6, t0
    not               t7, t1
    srl               t6, t6, 24               /* t6 = qAlpha(~s 1) */
    srl               t7, t7, 24               /* t7 = qAlpha(~s 2) */
    addiu             a2, -2

    INTERPOLATE_PIXEL_255 t0, t4, t2, t6, AT, t9, t8, s1, s2, s3, s4
    INTERPOLATE_PIXEL_255 t1, t5, t3, t7, s0, t9, t8, s1, s2, s3, s4

    addiu             a1, 8
    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a2, 11b
     addiu             a0, 8
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t2, 0(a0)                /* dest 1 */
    lw                t3, 4(a0)                /* dest 2 */
    lw                t0, 0(a1)                /* src 1 */
    lw                t1, 4(a1)                /* src 2 */
    srl               t4, t2, 24               /* t4 = qAlpha(dest 1) */
    srl               t5, t3, 24               /* t5 = qAlpha(dest 2) */
    not               t6, t0
    not               t7, t1
    srl               t6, t6, 24               /* t6 = qAlpha(~src 1) */
    srl               t7, t7, 24               /* t7 = qAlpha(~src 2) */
    addiu             a2, -2

    INTERPOLATE_PIXEL_255 t0, t4, t2, t6, AT, t9, t8, s1, s2, s3, s4
    INTERPOLATE_PIXEL_255 t1, t5, t3, t7, s0, t9, t8, s1, s2, s3, s4

    addiu             a1, 8
    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a2, 2b
     addiu             a0, 8

3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    lw                s3, 12(sp)
    lw                s4, 16(sp)
    addiu             sp, 24
    jr                ra
     nop
    .set              at

END(comp_func_SourceAtop_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_solid_DestinationAtop_dsp_asm_x2)
/*
 * Two pixels per iteration:
 *
 *   dest = INTERPOLATE_PIXEL_255(dest, a, color, qAlpha(~dest))
 *
 * INTERPOLATE_PIXEL_255 comes from the included header and is not visible
 * here; the formula assumes it implements the Qt operation of that name.
 *
 * a0 - uint *dest
 * a1 - int length
 * a2 - uint color
 * a3 - uint a
 *
 * AT is used as an ordinary register (hence .set noat). s0 is saved in an
 * 8-byte frame: one word is needed, rounded up so that sp keeps the 8-byte
 * alignment the o32 ABI requires. The counter drops by 2 per iteration and
 * the loop stops only at exactly zero, so length must be even.
 */

    .set              noat
    addiu             sp, -8
    sw                s0, 0(sp)
    beqz              a1, 2f
     nop
    li                t9, 8388736              /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    ori               t8, t8, 0xff00           /* t8 = 0xff00ff00 (andi_factor) */
1:
    lw                t0, 0(a0)                /* t0 = dest 1 */
    lw                t1, 4(a0)                /* t1 = dest 2 */
    addiu             a1, -2
    not               t2, t0
    not               t3, t1
    srl               t2, t2, 24               /* t2 = qAlpha(~(dest 1)) */
    srl               t3, t3, 24               /* t3 = qAlpha(~(dest 2)) */

    INTERPOLATE_PIXEL_255 t0, a3, a2, t2, AT, t9, t8, t4, t5, t6, t7
    INTERPOLATE_PIXEL_255 t1, a3, a2, t3, s0, t9, t8, t4, t5, t6, t7

    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a1, 1b
     addiu            a0, 8
2:
    lw                s0, 0(sp)
    addiu             sp, 8
    jr                ra
     nop
    .set              at

END(comp_func_solid_DestinationAtop_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_DestinationAtop_dsp_asm_x2)
/*
 * Two pixels per iteration:
 *
 *   const_alpha == 255:
 *       dest = INTERPOLATE_PIXEL_255(dest, qAlpha(src), src, qAlpha(~dest))
 *   otherwise:
 *       cia  = 255 - const_alpha
 *       s    = BYTE_MUL(src, const_alpha)
 *       a    = qAlpha(s) + cia
 *       dest = INTERPOLATE_PIXEL_255(dest, a, s, qAlpha(~dest))
 *
 * BYTE_MUL_x2 and INTERPOLATE_PIXEL_255 come from the included header and
 * are not visible here; the formulas assume they implement the Qt
 * operations of the same names.
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * AT is used as an ordinary register (hence .set noat); s0-s5 are saved on
 * the stack and restored before returning. The counter drops by 2 per
 * iteration and the loops stop only at exactly zero, so length must be even.
 */

    .set              noat
    addiu             sp, -24
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    sw                s3, 12(sp)
    sw                s4, 16(sp)
    sw                s5, 20(sp)
    beqz              a2, 3f
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    li                t0, 0xff
    beq               a3, t0, 2f
     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */

/* part where const_alpha != 255 */
1:
    li                s5, 0xff
    subu              s5, s5, a3               /* s5 = cia = 255 - const_alpha */
    replv.ph          a3, a3                   /* low halfword of const_alpha in
                                                * both halfwords */
11:
    lw                AT, 0(a1)                /* src 1 */
    lw                s0, 4(a1)                /* src 2 */

    BYTE_MUL_x2 AT, s0, t0, t1, a3, a3, t9, t3, t4, t5, t6, 0
                                               /* t0 = s 1, t1 = s 2 */

    lw                t2, 0(a0)                /* t2 = dest 1 */
    lw                t3, 4(a0)                /* t3 = dest 2 */

    not               t4, t2
    not               t5, t3
    srl               t4, t4, 24               /* t4 = qAlpha(~(dest 1)) */
    srl               t5, t5, 24               /* t5 = qAlpha(~(dest 2)) */
    srl               t6, t0, 24               /* t6 = qAlpha(s 1) */
    srl               t7, t1, 24               /* t7 = qAlpha(s 2) */
    addu              t6, t6, s5               /* t6 = a = qAlpha(s1) + cia */
    addu              t7, t7, s5               /* t7 = a = qAlpha(s2) + cia */
    addiu             a2, -2

    INTERPOLATE_PIXEL_255 t2, t6, t0, t4, AT, t9, t8, s1, s2, s3, s4
    INTERPOLATE_PIXEL_255 t3, t7, t1, t5, s0, t9, t8, s1, s2, s3, s4

    addiu             a1, 8
    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a2, 11b
     addiu             a0, 8
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t2, 0(a0)                /* d1 */
    lw                t3, 4(a0)                /* d2 */
    lw                t0, 0(a1)                /* s1 */
    lw                t1, 4(a1)                /* s2 */
    srl               t4, t0, 24               /* t4 = qAlpha(s1) */
    srl               t5, t1, 24               /* t5 = qAlpha(s2) */
    not               t6, t2
    not               t7, t3
    srl               t6, t6, 24               /* qAlpha(~d1) */
    srl               t7, t7, 24               /* qAlpha(~d2) */
    addiu             a2, -2

    INTERPOLATE_PIXEL_255 t2, t4, t0, t6, AT, t9, t8, s1, s2, s3, s4
    INTERPOLATE_PIXEL_255 t3, t5, t1, t7, s0, t9, t8, s1, s2, s3, s4

    addiu             a1, 8
    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a2, 2b
     addiu             a0, 8

3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    lw                s3, 12(sp)
    lw                s4, 16(sp)
    lw                s5, 20(sp)
    addiu             sp, 24
    jr                ra
     nop
    .set              at

END(comp_func_DestinationAtop_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_solid_XOR_dsp_asm_x2)
/*
 * XOR composition of a solid colour onto a destination span, processing
 * two pixels per loop iteration.
 *
 * a0 - uint *dest
 * a1 - int length
 * a2 - uint color
 * a3 - uint sia
 *
 * sia is passed unchanged as the 4th operand of INTERPOLATE_PIXEL_255
 * (presumably the inverse alpha of color, computed by the caller).
 *
 * A zero length returns immediately.  The loop subtracts 2 from the
 * length and loops while it is non-zero, so it only terminates for an
 * even length.
 *
 * s0 is spilled to a 4-byte stack frame.  AT is used as an ordinary data
 * register, hence .set noat / .set at.  INTERPOLATE_PIXEL_255 is a macro
 * defined outside this function; judging by the stores that follow, its
 * 5th operand receives the resulting pixel.
 */

    .set              noat
    addiu             sp, -4                   /* room for s0 */
    sw                s0, 0(sp)
    beqz              a1, 2f                   /* nothing to do for length == 0 */
     nop
    li                t9, 8388736              /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    ori               t8, t8, 0xff00           /* t8 = 0xff00ff00 (andi_factor) */
1:
    lw                t0, 0(a0)                /* t0 = dest 1 */
    lw                t1, 4(a0)                /* t1 = dest 2 */
    addiu             a1, -2                   /* two pixels consumed */
    not               t2, t0
    not               t3, t1
    srl               t2, t2, 24               /* t2 = qAlpha(~(dest 1)) */
    srl               t3, t3, 24               /* t3 = qAlpha(~(dest 2)) */

    INTERPOLATE_PIXEL_255 a2, t2, t0, a3, AT, t9, t8, t4, t5, t6, t7
    INTERPOLATE_PIXEL_255 a2, t3, t1, a3, s0, t9, t8, t4, t5, t6, t7

    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a1, 1b
     addiu            a0, 8                    /* dest += 2 (delay slot) */
2:
    lw                s0, 0(sp)
    addiu             sp, 4                    /* release the frame */
    jr                ra
     nop
    .set              at

END(comp_func_solid_XOR_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_XOR_dsp_asm_x2)
/*
 * XOR composition of a source span onto a destination span, processing
 * two pixels per loop iteration.
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * A zero length returns immediately.  Both loops subtract 2 from the
 * length and loop while it is non-zero, so they only terminate for an
 * even length.
 *
 * s0-s4 are spilled to a 20-byte stack frame and restored at label 3.
 * AT is used as an ordinary data register, hence .set noat / .set at.
 * BYTE_MUL_x2 and INTERPOLATE_PIXEL_255 are macros defined outside this
 * function; judging by the stores that follow each use, the 5th operand
 * of INTERPOLATE_PIXEL_255 receives the resulting pixel.
 */

    .set              noat
    addiu             sp, -20                  /* room for s0-s4 */
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    sw                s3, 12(sp)
    sw                s4, 16(sp)
    beqz              a2, 3f                   /* nothing to do for length == 0 */
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    li                t0, 0xff
    beq               a3, t0, 2f              /* const_alpha == 255: skip the scaling */
     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */

/* part where const_alpha != 255 */
1:
    replv.ph          a3, a3                   /* a3 = const_alpha in both halfwords */
11:
    lw                AT, 0(a1)                /* src 1 */
    lw                s0, 4(a1)                /* src 2 */

    BYTE_MUL_x2 AT, s0, t0, t1, a3, a3, t9, t3, t4, t5, t6, 0
                                               /* t0 = s1 */
                                               /* t1 = s2 */

    lw                t2, 0(a0)                /* t2 = dest 1 */
    lw                t3, 4(a0)                /* t3 = dest 2 */

    not               t4, t2
    not               t5, t3
    srl               t4, t4, 24               /* t4 = qAlpha(~(dest 1)) */
    srl               t5, t5, 24               /* t5 = qAlpha(~(dest 2)) */
    not               t6, t0
    not               t7, t1
    srl               t6, t6, 24               /* t6 = qAlpha(~s1) */
    srl               t7, t7, 24               /* t7 = qAlpha(~s2) */
    addiu             a2, -2                   /* two pixels consumed */

    INTERPOLATE_PIXEL_255 t0, t4, t2, t6, AT, t9, t8, s1, s2, s3, s4
    INTERPOLATE_PIXEL_255 t1, t5, t3, t7, s0, t9, t8, s1, s2, s3, s4

    addiu             a1, 8                    /* src += 2 */
    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a2, 11b
     addiu             a0, 8                   /* dest += 2 (delay slot) */
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t2, 0(a0)                /* d1 */
    lw                t3, 4(a0)                /* d2 */
    lw                t0, 0(a1)                /* s1 */
    lw                t1, 4(a1)                /* s2 */
    not               t4, t0
    not               t5, t1
    srl               t4, t4, 24               /* t4 = qAlpha(~s1) */
    srl               t5, t5, 24               /* t5 = qAlpha(~s2) */
    not               t6, t2
    not               t7, t3
    srl               t6, t6, 24               /* qAlpha(~d1) */
    srl               t7, t7, 24               /* qAlpha(~d2) */
    addiu             a2, -2                   /* two pixels consumed */

    INTERPOLATE_PIXEL_255 t0, t6, t2, t4, AT, t9, t8, s1, s2, s3, s4
    INTERPOLATE_PIXEL_255 t1, t7, t3, t5, s0, t9, t8, s1, s2, s3, s4

    addiu             a1, 8                    /* src += 2 */
    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a2, 2b
     addiu             a0, 8                   /* dest += 2 (delay slot) */

/* common exit: restore callee-saved registers and release the frame */
3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    lw                s3, 12(sp)
    lw                s4, 16(sp)
    addiu             sp, 20
    jr                ra
     nop
    .set              at

END(comp_func_XOR_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_solid_SourceOut_dsp_asm_x2)
/*
 * SourceOut composition of a solid colour onto a destination span,
 * processing two pixels per loop iteration.
 *
 * a0 - uint *dest
 * a1 - int length
 * a2 - uint color
 * a3 - uint const_alpha
 *
 * A zero length returns immediately.  Both loops subtract 2 from the
 * length and loop while it is non-zero, so they only terminate for an
 * even length.
 *
 * s0-s2 are spilled to a 12-byte stack frame and restored at label 3.
 * AT is used as an ordinary data register, hence .set noat / .set at.
 * BYTE_MUL, BYTE_MUL_x2 and INTERPOLATE_PIXEL_255 are macros defined
 * outside this function.
 */

    .set              noat
    addiu             sp, -12                  /* room for s0-s2 */
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    beqz              a1, 3f                   /* nothing to do for length == 0 */
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    li                t0, 0xff
    beq               a3, t0, 2f              /* const_alpha == 255: simpler loop */
     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */

/* part where const_alpha != 255 */
1:
    replv.ph          t0, a3                   /* t0 = const_alpha in both halfwords */
    li                t5, 0xff
    BYTE_MUL a2, a2, t0, t9, t1, t2, t3, t4    /* a2 = color ( = BYTE_MUL(color, const_alpha)); */
    subu              t1, t5, a3               /* t1 = cia = 255 - const_alpha */
11:
    lw                t2, 0(a0)                /* t2 = d1 */
    lw                s0, 4(a0)                /* s0 = d2 */
    addiu             a1, -2                   /* two pixels consumed */
    not               t3, t2
    not               s2, s0
    srl               t3, t3, 24               /* t3 = qAlpha(~d1) */
    srl               s2, s2, 24               /* s2 = qAlpha(~d2) */

    INTERPOLATE_PIXEL_255 a2, t3, t2, t1, AT, t9, t8, t4, t5, t6, t7
    INTERPOLATE_PIXEL_255 a2, s2, s0, t1, s1, t9, t8, t4, t5, t6, t7

    sw                AT, 0(a0)
    sw                s1, 4(a0)
    bnez              a1, 11b
     addiu            a0, 8                    /* dest += 2 (delay slot) */
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t0, 0(a0)                /* dest 1 */
    lw                t1, 4(a0)                /* dest 2 */
    not               t4, t0
    not               t5, t1
    srl               t4, t4, 24               /* t4 = qAlpha(~d1) */
    srl               t5, t5, 24               /* t5 = qAlpha(~d2) */
    replv.ph          t2, t4                   /* replicate into both halfwords */
    replv.ph          t3, t5
    addiu             a1, -2                   /* two pixels consumed */

    /* t8 is reused as an output here; the andi_factor it held is not
     * passed to any macro on this path. */
    BYTE_MUL_x2 a2, a2, t8, AT, t2, t3, t9, t4, t5, t6, t7

    sw                t8, 0(a0)
    sw                AT, 4(a0)
    bnez              a1, 2b
     addiu             a0, 8                   /* dest += 2 (delay slot) */

/* common exit: restore callee-saved registers and release the frame */
3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    addiu             sp, 12
    jr                ra
     nop
    .set              at

END(comp_func_solid_SourceOut_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_SourceOut_dsp_asm_x2)
/*
 * SourceOut composition of a source span onto a destination span,
 * processing two pixels per loop iteration.
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * A zero length returns immediately.  Both loops subtract 2 from the
 * length and loop while it is non-zero, so they only terminate for an
 * even length.
 *
 * s0-s3 are spilled to a 16-byte stack frame and restored at label 3.
 * AT is used as an ordinary data register, hence .set noat / .set at.
 * BYTE_MUL_x2 and INTERPOLATE_PIXEL_255 are macros defined outside this
 * function.
 */

    .set              noat
    addiu             sp, -16                  /* room for s0-s3 */
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    sw                s3, 12(sp)
    beqz              a2, 3f                   /* nothing to do for length == 0 */
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    li                t0, 0xff
    beq               a3, t0, 2f              /* const_alpha == 255: simpler loop */
     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */

/* part where const_alpha != 255 */
1:
    li                t5, 0xff
    subu              t7, t5, a3               /* t7 = cia = 255 - const_alpha */
    replv.ph          a3, a3                   /* a3 = const_alpha in both halfwords */
11:
    lw                t0, 0(a1)                /* t0 = src 1 */
    lw                t1, 4(a1)                /* t1 = src 2 */
    addiu             a2, -2                   /* two pixels consumed */

    /* AT / s0 are read below as the scaled source pixels */
    BYTE_MUL_x2 t0, t1, AT, s0, a3, a3, t9, t3, t4, t5, t6, 0

    lw                t0, 0(a0)                /* t0 = dest 1 */
    lw                t1, 4(a0)                /* t1 = dest 2 */
    addiu             a1, 8                    /* src += 2 */

    not               t2, t0
    not               t3, t1
    srl               t2, t2, 24               /* t2 = qAlpha(~d1) */
    srl               t3, t3, 24               /* t3 = qAlpha(~d2) */

    INTERPOLATE_PIXEL_255 AT, t2, t0, t7, s1, t9, t8, t4, t5, t6, s3
    INTERPOLATE_PIXEL_255 s0, t3, t1, t7, s2, t9, t8, t4, t5, t6, s3

    sw                s1, 0(a0)
    sw                s2, 4(a0)
    bnez              a2, 11b
     addiu            a0, 8                    /* dest += 2 (delay slot) */
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t2, 0(a0)                /* dest 1 */
    lw                t3, 4(a0)                /* dest 2 */
    lw                t0, 0(a1)                /* src 1 */
    lw                t1, 4(a1)                /* src 2 */
    not               t4, t2
    not               t5, t3
    srl               t4, t4, 24               /* qAlpha(~d1) */
    srl               t5, t5, 24               /* qAlpha(~d2) */
    replv.ph          t2, t4                   /* replicate into both halfwords */
    replv.ph          t3, t5
    addiu             a2, -2                   /* two pixels consumed */

    /* t8 is reused as an output here; the andi_factor it held is not
     * passed to any macro on this path. */
    BYTE_MUL_x2 t0, t1, t8, AT, t2, t3, t9, t4, t5, t6, t7

    addiu             a1, 8                    /* src += 2 */
    sw                t8, 0(a0)
    sw                AT, 4(a0)
    bnez              a2, 2b
     addiu             a0, 8                   /* dest += 2 (delay slot) */

/* common exit: restore callee-saved registers and release the frame */
3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    lw                s3, 12(sp)
    addiu             sp, 16
    jr                 ra
     nop
    .set              at

END(comp_func_SourceOut_dsp_asm_x2)

LEAF_MIPS_DSP(comp_func_Source_dsp_asm_x2)
/*
 * Source composition with a constant alpha: every destination pixel is
 * replaced by INTERPOLATE_PIXEL_255(src, const_alpha, dest, ialpha),
 * two pixels per loop iteration.
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * A zero length returns immediately.  The loop subtracts 2 from the
 * length and loops while it is non-zero, so it only terminates for an
 * even length.
 *
 * s0-s1 are spilled to an 8-byte stack frame.  AT is used as an ordinary
 * data register, hence .set noat / .set at.  INTERPOLATE_PIXEL_255 is a
 * macro defined outside this function; judging by the stores that follow,
 * its 5th operand receives the resulting pixel.
 */

    .set              noat
    addiu             sp, -8                   /* room for s0-s1 */
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    beqz              a2, 2f                   /* nothing to do for length == 0 */
     nop
    li                t9, 8388736              /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    ori               t8, t8, 0xff00           /* t8 = 0xff00ff00 (andi_factor) */
    li                t7, 0xff
    subu              t7, t7, a3               /* t7 = ialpha = 255 - const_alpha */
1:
    lw                t0, 0(a0)                /* t0 = dest 1 */
    lw                t1, 4(a0)                /* t1 = dest 2 */
    lw                t2, 0(a1)                /* t2 = src 1 */
    lw                t3, 4(a1)                /* t3 = src 2 */
    addiu             a2, -2                   /* two pixels consumed */
    addiu             a1, 8                    /* src += 2 */

    INTERPOLATE_PIXEL_255 t2, a3, t0, t7, AT, t9, t8, t4, t5, t6, s1
    INTERPOLATE_PIXEL_255 t3, a3, t1, t7, s0, t9, t8, t4, t5, t6, s1

    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a2, 1b
     addiu            a0, 8                    /* dest += 2 (delay slot) */
2:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    addiu             sp, 8                    /* release the frame */
    jr                ra
     nop
    .set              at

END(comp_func_Source_dsp_asm_x2)

LEAF_MIPS_DSP(qt_blend_argb32_on_argb32_mips_dsp_asm_x2)
/*
 * Blends an ARGB32 source span over an ARGB32 destination span with a
 * constant alpha, two pixels per loop iteration.  Per pixel:
 *
 *     s    = BYTE_MUL(src, const_alpha)
 *     dest = s + BYTE_MUL(dest, qAlpha(~s))
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * A zero length returns immediately.  The loop subtracts 2 from the
 * length and loops while it is non-zero, so it only terminates for an
 * even length.
 *
 * s0-s2 are spilled to a 12-byte stack frame.  AT is used as an ordinary
 * data register, hence .set noat / .set at.  BYTE_MUL_x2 is a macro
 * defined outside this function; judging by how the registers are used
 * afterwards, its 3rd and 4th operands receive the two results.
 */

    .set              noat
    addiu             sp, -12                  /* room for s0-s2 */
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    beqz              a2, 2f                   /* nothing to do for length == 0 */
     nop
    replv.ph          a3, a3                   /* a3 = const_alpha in both halfwords */
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */

1:
    lw                t0, 0(a1)                /* t0 = src 1 */
    lw                t1, 4(a1)                /* t1 = src 2 */
    addiu             a2, -2                   /* two pixels consumed */

    /* AT = scaled src 1, t7 = scaled src 2 */
    BYTE_MUL_x2       t0, t1, AT, t7, a3, a3, t9, t3, t4, t5, t6, 0

    lw                t0, 0(a0)                /* t0 = dest 1 */
    lw                t1, 4(a0)                /* t1 = dest 2 */
    not               s1, AT
    not               s2, t7
    srl               s1, s1, 24               /* reg s1 = qAlpha(~(scaled src 1)) */
    srl               s2, s2, 24               /* reg s2 = qAlpha(~(scaled src 2)) */
    replv.ph          s1, s1                   /* replicate into both halfwords */
    replv.ph          s2, s2

    /* t2 / t3 = dest 1 / dest 2 scaled by the inverse source alpha */
    BYTE_MUL_x2 t0, t1, t2, t3, s1, s2, t9, t4, t5, t6, s0

    addiu             a1, 8                    /* src += 2 */
    addu              AT, AT, t2               /* result 1 = s + scaled dest 1 */
    addu              t7, t7, t3               /* result 2 = s + scaled dest 2 */
    sw                AT, 0(a0)
    sw                t7, 4(a0)
    bnez              a2, 1b
     addiu            a0, 8                    /* dest += 2 (delay slot) */

2:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    addiu             sp, 12                   /* release the frame */
    jr                ra
     nop
    .set              at

END(qt_blend_argb32_on_argb32_mips_dsp_asm_x2)

LEAF_MIPS_DSP(qt_blend_argb32_on_argb32_const_alpha_256_mips_dsp_asm)
/*
 * Blends an ARGB32 source span over an ARGB32 destination span without
 * any additional constant alpha, one pixel per iteration:
 *
 *   - source alpha is 0xff   -> dst[i] = src[i]
 *   - source pixel is 0      -> dst[i] is left untouched
 *   - otherwise              -> dst[i] = s + BYTE_MUL(dst[i], qAlpha(~s)),
 *                               skipping the multiply when dst[i] is 0
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 *
 * Only t registers are used, so no stack frame is needed.  BYTE_MUL is a
 * macro defined outside this function; from its use here the 2nd operand
 * (t8) receives the result.
 */

    beqz              a2, 5f         /* nothing to do for length == 0 */
     nop
    li                t7, 8388736    /* t7 = 0x800080 */
    b                 2f
     nop
/* source pixel was 0: step over the destination pixel without writing it */
1:
    addiu             a0, a0, 4
    addiu             a2, a2, -1
    beqz              a2, 5f
     nop
/* main loop entry: fetch the next source pixel */
2:
    lw                t0, 0(a1)      /* t0 = s = src[i] */
    addiu             a1, a1, 4
    nor               t1, t0, zero
    srl               t1, t1, 24     /* t1 = qAlpha(~s) = 255 - qAlpha(s) */
    bnez              t1, 3f         /* source not fully opaque: blend */
     nop
    sw                t0, 0(a0)      /* dst[i] = src[i] */
    addiu             a2, a2, -1
    bnez              a2, 2b
     addiu            a0, a0, 4
    b 5f
     nop
/* source pixel is not fully opaque */
3:
    beqz              t0, 1b         /* s == 0: keep dst[i] as it is */
     replv.ph          t6, t1        /* | 0 | qAlpha(~s) | 0 | qAlpha(~s) | */

    lw                t4, 0(a0)
    addiu             a2, a2, -1
    beqz              t4, 31f        /* dst[i] == 0: product is 0, skip multiply */
     move             t8, zero       /* delay slot: runs on both paths */

    BYTE_MUL t4, t8, t6, t7, t1, t2, t3, t4
31:
    addu              t8, t0, t8    /* dst[i] =
                                     * s + BYTE_MUL(dst[i],~qAlpha(s)) */
    sw                t8, 0(a0)
    bnez              a2, 2b
     addiu            a0, a0, 4
    b                 5f
     nop
5:
    jr                ra
     nop

END(qt_blend_argb32_on_argb32_const_alpha_256_mips_dsp_asm)


/*
 * Endian-dependent helpers for
 * qt_blend_rgb16_on_rgb16_const_alpha_256_mips_dsp_asm, which uses them
 * to move halfword-aligned pixel data through word-sized registers.
 *
 *  PACK(r, s, t) - packrl.ph; the two source operands are swapped between
 *                  the little- and big-endian variants.
 *  SWHI / SWLO   - partial-word stores: swl/swr on little-endian, swr/swl
 *                  on big-endian, at offsets o + 1 and o + 0.
 *  LDHI / LDLO   - partial-word loads: lwl/lwr on little-endian, lwr/lwl
 *                  on big-endian, at offsets o + 1 and o + 2.
 */
#if defined(__MIPSEL) && __MIPSEL
# define PACK(r, s, t)  packrl.ph r, s, t
# define SWHI(r, o, b)  swl r, o + 1 (b)
# define SWLO(r, o, b)  swr r, o + 0 (b)
# define LDHI(r, o, b)  lwl r, o + 1 (b)
# define LDLO(r, o, b)  lwr r, o + 2 (b)
#else
# define PACK(r, s, t)  packrl.ph r, t, s
# define SWHI(r, o, b)  swr r, o + 1 (b)
# define SWLO(r, o, b)  swl r, o + 0 (b)
# define LDHI(r, o, b)  lwr r, o + 1 (b)
# define LDLO(r, o, b)  lwl r, o + 2 (b)
#endif

LEAF_MIPS_DSP(qt_blend_rgb16_on_rgb16_const_alpha_256_mips_dsp_asm)
/*
 * Copies len 16-bit pixels from src to dst.  No per-pixel arithmetic is
 * performed; the work is in moving eight pixels per iteration with
 * word-sized accesses whatever the relative alignment of the pointers.
 *
 * a0 - dst (*r5g6b5)
 * a1 - src (const *r5g6b5)
 * a2 - len (unsigned int)
 *
 * Register usage:
 *  t0-3 - Scratch registers
 *  t4   - Number of iterations to do in unrolled loops
 *  t5-7 - Auxiliary scratch registers.
 *
 * Check if base addresses of src/dst are aligned, cases:
 *  a) Both aligned.
 *  b) Both unaligned:
 *      1. Copy a halfword
 *      2. Use aligned case.
 *  c) dst aligned, src unaligned:
 *      1. Read a word from dst, halfword from src.
 *      2. Continue reading words from both.
 *  d) dst unaligned, src aligned:
 *      1. Read a word from src, halfword from dst.
 *      2. Continue reading words from both.
 *
 * Labels: 4 = both aligned, 2 = dst unaligned, 3 = src unaligned,
 *         5 = halfword tail loop, 0 = return.
 */

    beqz   a2, 0f       /* if (a2:len == 0): return */
     andi  t0, a0, 0x3  /* t0 = a0:dst % 4 */
    andi   t1, a1, 0x3  /* t1 = a1:src % 4 */
    or     t2, t0, t1   /* t2 = t0 | t1 */

    beqz   t2, 4f       /* both aligned */
     nop
    beqz   t0, 3f       /* dst aligned, src unaligned */
     nop
    beqz   t1, 2f       /* src aligned, dst unaligned */
     nop

    /*
     * Both src/dst are unaligned: copy 1 halfword from src to dst,
     * then fall through to continue with the word-aligned copy.
     */
    lhu    t0, 0 (a1)    /* t0 <- ((uint16_t*) src)[0] */
    addiu  a1, a1, 2     /* src++ */
    addiu  a2, a2,-1     /* len-- */
    sh     t0, 0 (a0)    /* t0 -> ((uint16_t*) dst)[0] */
    addiu  a0, a0, 2     /* dst++ */

    /*
     * Both src/dst pointers are word-aligned, process eight
     * items at a time in an unrolled loop.
     */
4:  beqz   a2, 0f        /* if (len == 0): return */
     srl   t4, a2, 3     /* t4 = len / 8 */

    beqz   t4, 5f        /* if (t4 == 0): tail */
     andi  a2, a2, 0x07  /* len = len % 8 */

1:  lw     t0,  0 (a1)
    lw     t1,  4 (a1)
    lw     t2,  8 (a1)
    lw     t3, 12 (a1)

    addiu  t4, t4, -1     /* t4-- */
    addiu  a1, a1, 16     /* src += 8 */

    sw     t0,  0 (a0)
    sw     t1,  4 (a0)
    sw     t2,  8 (a0)
    sw     t3, 12 (a0)

    bnez   t4, 1b
     addiu a0, a0, 16     /* dst += 8 */

    b 5f
    nop


    /*
     * dst pointer is unaligned: load four aligned words from src, store
     * the first and last halfwords with partial-word stores and the six
     * halfwords in between as three re-packed words at dst + 2/6/10.
     */
2:  beqz   a2, 0f        /* if (len == 0): return */
     srl   t4, a2, 3     /* t4 = len / 8 */
    beqz   t4, 5f        /* if (t4 == 0): tail */
     andi  a2, a2, 0x07  /* len = len % 8 */

1:  lw     t0,  0 (a1)
   lw     t1,  4 (a1)
    lw     t2,  8 (a1)
    lw     t3, 12 (a1)

    addiu  t4, t4, -1    /* t4-- */
    addiu  a1, a1, 16    /* src += 8 */

    SWLO  (t0,  0, a0)
    PACK  (t5, t1, t0)
    PACK  (t6, t2, t1)
    PACK  (t7, t3, t2)
    SWHI  (t3, 14, a0)
    sw     t5,  2 (a0)
    sw     t6,  6 (a0)
    sw     t7, 10 (a0)

    bnez   t4, 1b
     addiu a0, a0, 16    /* dst += 8 */

    b 5f
     nop

    /*
     * src pointer is unaligned: fetch the first and last halfwords with
     * partial-word loads, the six halfwords in between as three words at
     * src + 2/6/10, and re-pack them into four aligned words for dst.
     */
3:  beqz   a2, 0f        /* if (len == 0): return */
     srl   t4, a2, 3     /* t4 = len / 8 */
    beqz   t4, 5f        /* if (t4 == 0): tail */
     andi  a2, a2, 0x07  /* len = len % 8 */

1:  LDHI  (t0,  0, a1)
    lw     t1,  2 (a1)
    lw     t2,  6 (a1)
    lw     t3, 10 (a1)
    LDLO  (t5, 12, a1)

    addiu  t4, t4, -1    /* t4-- */
    addiu  a1, a1, 16    /* src += 8 */

    PACK  (t0, t1, t0)
    PACK  (t6, t2, t1)
    PACK  (t7, t3, t2)
    sw     t0,  0 (a0)
    PACK  (t0, t5, t3)
    sw     t6,  4 (a0)
    sw     t7,  8 (a0)
    sw     t0, 12 (a0)

    bnez   t4, 1b
     addiu a0, a0, 16    /* dst += 8 */


5:  /* Process remaining items (a2:len < 8), one at a time */
    beqz   a2, 0f
     nop

1:  lhu    t0, 0 (a1)  /* t0 <- ((uint16_t*) src)[0] */
    addiu  a2, a2,-1   /* len-- */
    addiu  a1, a1, 2   /* src++ */
    sh     t0, 0 (a0)  /* t0 -> ((uint16_t*) dst)[0] */
    bnez   a2, 1b      /* if (len != 0): loop */
     addiu a0, a0, 2   /* dst++ */

0:  jr ra
     nop

END(qt_blend_rgb16_on_rgb16_const_alpha_256_mips_dsp_asm)


/* The endian helpers are private to the function above. */
#undef LDHI
#undef LDLO
#undef PACK
#undef SWHI
#undef SWLO


LEAF_MIPS_DSP(qt_blend_rgb16_on_rgb16_mips_dsp_asm)
/*
 * Blends an RGB16 (r5g6b5) source span onto an RGB16 destination span
 * with a constant alpha, one pixel per iteration.
 *
 * a0 - dst (*r5g6b5)
 * a1 - src (const *r5g6b5)
 * a2 - len (unsigned int) - batch length
 * a3 - alpha (int)
 *
 * Set-up:
 *   a3 = ((alpha * 255) >> 8) + 1      source weight for the green field
 *   t9 = (255 - ((alpha * 255) >> 8)) + 1
 *                                       destination weight for green
 *   t4 = a3 >> 2, t5 = t9 >> 2          weights for the red/blue fields
 *
 * Per pixel the green field (mask 0x07e0) is multiplied by the full
 * weight and shifted right by 8, while the red and blue fields (mask
 * 0xf81f) are multiplied together by the reduced weight and shifted
 * right by 6; the weighted source and destination pixels are then added.
 *
 * Only t registers are used, so no stack frame is needed.
 */

    beqz    a2, 2f                  /* nothing to do for len == 0 */
     li     t9, 255
    sll     t8, a3, 8
    subu    a3, t8, a3              /* a3 = alpha * 255 */
    srl     a3, a3, 8               /* a3 = (alpha * 255) >> 8 */
    subu    t9, t9, a3              /* t9 = 255 - a3 */
    addiu   a3, a3, 1               /* a3 = source weight */
    srl     t4, a3, 2               /* t4 = source weight for red/blue */
    addiu   t9, t9, 1               /* t9 = destination weight */
    srl     t5, t9, 2               /* t5 = destination weight for red/blue */
1:
    lhu     t0, 0(a1)               /* t0 = source pixel */
    lhu     t1, 0(a0)               /* t1 = destination pixel */
    addiu   a2, a2, -1              /* len-- */
    andi    t2, t0, 0x07e0          /* t2 = source green */
    andi    t0, t0, 0xf81f          /* t0 = source red | blue */
    mul     t2, t2, a3
    mul     t0, t0, t4
    andi    t3, t1, 0x07e0          /* t3 = destination green */
    andi    t1, t1, 0xf81f          /* t1 = destination red | blue */
    mul     t3, t3, t9
    mul     t1, t1, t5
    addiu   a1, a1, 2               /* src++ */
    srl     t2, t2, 8
    srl     t0, t0, 6
    andi    t2, t2, 0x07e0
    andi    t0, t0, 0xf81f
    or      t0, t0, t2              /* t0 = weighted source pixel */
    srl     t3, t3, 8
    srl     t1, t1, 6
    andi    t3, t3, 0x07e0
    andi    t1, t1, 0xf81f
    or      t1, t1, t3              /* t1 = weighted destination pixel */
    addu    t0, t0, t1
    sh      t0, 0(a0)
    bgtz    a2, 1b
     addiu  a0, a0, 2               /* dst++ (delay slot) */
2:
    jr      ra
     nop

END(qt_blend_rgb16_on_rgb16_mips_dsp_asm)


LEAF_MIPS_DSP(fetchUntransformed_888_asm_mips_dsp)
/*
 * Expands packed 3-byte pixels into 32-bit pixels with the top byte
 * forced to 0xff.  Source byte 0 ends up in bits 23..16, byte 1 in bits
 * 15..8 and byte 2 in bits 7..0.
 *
 * a0 - dst address (address of 32-bit aRGB value)
 * a1 - src address
 * a2 - length
 *
 * Strategy:
 *   - an odd length is made even by converting one pixel byte by byte;
 *   - the remaining pixels are converted two at a time, using loop 2
 *     when the source address is odd and loop 3 when it is even, so that
 *     every lhu hits a halfword-aligned address.  Each loop advances the
 *     source by 6 bytes, which preserves that parity.
 *
 * The register layouts in the comments below (| 0 | 0 | G1 | R1 | etc.)
 * describe the halfword loads as seen on a little-endian target.
 * v1 and t registers are used as scratch; no stack frame is needed.
 */

    beqz       a2, 4f        /* nothing to do for length == 0 */
     lui       t8, 0xff00    /* t8 = 0xff000000 (opaque alpha) */
    andi       t0, a2, 0x1
    beqz       t0, 1f        /* even length: go straight to the pair loops */
     nop
/* case for one pixel */
    lbu        t1, 0(a1)     /* t1 = byte 0 */
    lbu        v1, 2(a1)     /* v1 = byte 2 */
    lbu        t0, 1(a1)     /* t0 = byte 1 */
    addiu      a1, a1, 3
    addiu      a2, a2, -1
    sll        t1, t1, 0x10
    or         v1, v1, t8
    sll        t0, t0, 0x8
    or         v1, v1, t1
    or         v1, v1, t0    /* v1 = | ff | byte 0 | byte 1 | byte 2 | */
    sw         v1, 0(a0)
    addiu      a0, a0, 4

    beqz       a2, 4f        /* only one pixel is present (length = 1) */
     nop
1:
    andi       t0, a1, 0x1   /* choose the loop by source address parity */
    beqz       t0, 3f
     nop
/* source address is odd: byte + halfword, then halfword + byte */
2:
    lbu        t0, 0(a1)     /* t0 = | 0 | 0 | 0 | R1 | */
    lhu        t1, 1(a1)     /* t1 = | 0 | 0 | B1 | G1 | */
    addiu      a1, a1, 3
    lhu        t2, 0(a1)     /* t2 = | 0 | 0 | G2 | R2 | */
    lbu        t3, 2(a1)     /* t3 = | 0 | 0 | 0 | B2 | */

    sll        t0, t0, 16
    or         t0, t0, t8    /* t0 = | ff | R1 | 0 | 0 | */
    shll.ph    t4, t1, 8     /* t4 = | 0 | 0 | G1 | 0 | */
    srl        t5, t1, 8     /* t5 = | 0 | 0 | 0 | B1 | */
    or         t4, t4, t5    /* t4 = | 0 | 0 | G1 | B1 | */
    or         t0, t0, t4    /* t0 = | ff | R1 | G1 | B1 | */

    shll.ph    t4, t2, 8     /* t4 = | 0 | 0 | R2 | 0 | */
    srl        t5, t2, 8     /* t5 = | 0 | 0 | 0 | G2 | */
    or         t4, t4, t5    /* t4 = | 0 | 0 | R2 | G2 | */
    sll        t4, t4, 8     /* t4 = | 0 | R2 | G2 | 0 | */
    or         t5, t3, t8    /* t5 = | ff | 0 | 0 | B2 | */
    or         t2, t4, t5    /* t2 = | ff | R2 | G2 | B2 | */

    sw         t0, 0(a0)
    addiu      a1, a1, 3
    sw         t2, 4(a0)
    addiu      a2, a2, -2
    bnez       a2, 2b
     addiu     a0, a0, 8
    b          4f
     nop
/* source address is even: halfword + byte, then byte + halfword */
3:
    lhu        t0, 0(a1)     /* t0 = | 0 | 0 | G1 | R1 | */
    lbu        t1, 2(a1)     /* t1 = | 0 | 0 | 0 | B1 | */
    addiu      a1, a1, 3
    lbu        t2, 0(a1)     /* t2 = | 0 | 0 | 0 | R2 | */
    lhu        t3, 1(a1)     /* t3 = | 0 | 0 | B2 | G2 | */

    srl        t4, t0, 8     /* t4 = | 0 | 0 | 0 | G1 | */
    shll.ph    t5, t0, 8     /* t5 = | 0 | 0 | R1 | 0 | */
    or         t0, t4, t5    /* t0 = | 0 | 0 | R1 | G1 | */
    sll        t6, t0, 8     /* t6 = | 0 | R1 | G1 | 0 | */
    or         t4, t1, t8    /* t4 = | ff | 0 | 0 | B1 | */
    or         t0, t6, t4    /* t0 = | ff | R1 | G1 | B1 | */

    sll        t2, t2, 16    /* t2 = | 0 | R2 | 0 | 0 | */
    srl        t4, t3, 8     /* t4 = | 0 | 0 | 0 | B2 | */
    shll.ph    t5, t3, 8     /* t5 = | 0 | 0 | G2 | 0 | */
    or         t3, t4, t5    /* t3 = | 0 | 0 | G2 | B2 | */
    or         t2, t2, t3
    or         t2, t2, t8    /* t2 = | ff | R2 | G2 | B2 | */

    sw         t0, 0(a0)
    addiu      a1, a1, 3
    sw         t2, 4(a0)
    addiu      a2, a2, -2
    bnez       a2, 3b
     addiu     a0, a0, 8
4:
    jr         ra
     nop

END(fetchUntransformed_888_asm_mips_dsp)


LEAF_MIPS_DSP(fetchUntransformed_444_asm_mips_dsp)
/*
 * Expands 16-bit pixels holding three 4-bit colour fields (bits 11..8,
 * 7..4 and 3..0) into 32-bit pixels.  Each 4-bit field is duplicated
 * into both nibbles of its output byte and the top byte is forced to
 * 0xff; the top nibble of the source pixel is ignored.
 *
 * a0 - dst address (address of 32-bit aRGB value)
 * a1 - src address
 * a2 - length
 *
 * Strategy:
 *   1: convert a single pixel (used to make the count even and to bring
 *      the source address to a word boundary);
 *   2: check whether the source address is word aligned;
 *   3: convert two pixels per iteration from one aligned word load;
 *   4: convert the last pixel left over after loop 3.
 *
 * v0, v1, a3 and t registers are used as scratch; no stack frame is
 * needed.
 */

    lui              t8, 0xff00     /* t8 = 0xff000000 (opaque alpha) */
    li               t4, 0x1        /* t4 = 1, compared against the count
                                     * at the end of block 1 */

    beqz             a2, 5f
     move            v0, a0         /* just return the address of buffer
                                     * for storing returning values
                                     * (v0 is reused as scratch by the
                                     * single-pixel paths below) */
    andi             t0, a2, 0x1
    beqz             t0, 2f         /* there is more then one pixel
                                     * (check src memory alignment (word)) */
     nop
/* convert one pixel */
1:
    lhu              v0, 0(a1)
    addiu            a1, a1, 2
    addiu            a2, a2, -1
    andi             t0, v0, 0xf00  /* t0 = field in bits 11..8 */
    andi             v1, v0, 0xf    /* v1 = field in bits 3..0 */
    andi             v0, v0, 0xf0   /* v0 = field in bits 7..4 */
    sra              t3, t0, 0x4
    sra              t1, v0, 0x4
    sra              t0, t0, 0x8
    sll              t2, v1, 0x4
    or               t0, t0, t3     /* t0 = high field duplicated (8 bits) */
    or               v0, t1, v0     /* v0 = middle field duplicated */
    lui              t1, 0xff00
    or               v1, t2, v1     /* v1 = low field duplicated */
    sll              t0, t0, 0x10
    or               v1, v1, t1
    sll              v0, v0, 0x8
    or               v1, v1, t0
    or               v0, v1, v0     /* v0 = | ff | RR | GG | BB | */
    sw               v0, 0(a0)
    addiu            a0, a0, 4
    beqz             a2, 5f         /* no more pixels for processing */
     nop
    beq              a2, t4, 4f     /* only one more pixel remained */
     nop
/* check if src memory address is word aligned */
2:
    andi             t0, a1, 0x3
    beqz             t0, 3f         /* memory is word aligned */
     andi            a3, a2, 0x1    /* set the a3 register as the comparison
                                     * for ending the unrolled loop
                                     * (1 if odd, 0 if even) */
    b                1b             /* not word aligned,
                                     * go another turn with
                                     * just one pixel processing */
     nop
/* convert two pixels per iteration; t4 is reused as scratch from here on,
 * which is safe because the loop never branches back to blocks 1 or 2 */
3:
    lw               t0, 0(a1)
    addiu            a2, a2, -2
    preceu.ph.qbr    t1, t0         /* t1 = | 0 | aR1 | 0 | G1B1 | */
    preceu.ph.qbl    t2, t0         /* t2 = | 0 | aR2 | 0 | G2B2 | */
    shll.qb          t3, t1, 4      /* t3 = | 0 | R1 0 | 0 | B1 0 | */
    srl              t4, t3, 4
    or               t0, t3, t4     /* t0 = | 0 | R1R1 | 0 | B1B1 | */
    andi             t3, t1, 0xf0   /* t3 = G1 << 4 */
    sll              t3, t3, 8
    srl              t4, t3, 4
    or               t1, t3, t4     /* t1 = | 0 | 0 | G1G1 | 0 | */
    or               t0, t0, t1     /* t0 = | 0 | R1R1 | G1G1 | B1B1 | */
    or               t0, t0, t8     /* t0 = | ff | R1R1 | G1G1 | B1B1 | */

    shll.qb          t3, t2, 4      /* t3 = | 0 | R2 0 | 0 | B2 0 | */
    srl              t4, t3, 4
    or               t7, t3, t4     /* t7 = | 0 | R2R2 | 0 | B2B2 | */
    andi             t3, t2, 0xf0   /* t3 = G2 << 4 */
    sll              t3, t3, 8
    srl              t4, t3, 4
    or               t1, t3, t4     /* t1 = | 0 | 0 | G2G2 | 0 | */
    or               t2, t7, t1     /* t2 = | 0 | R2R2 | G2G2 | B2B2 | */
    or               t2, t2, t8     /* t2 = | ff | R2R2 | G2G2 | B2B2 | */

    sw               t0, 0(a0)
    addiu            a1, a1, 4
    sw               t2, 4(a0)
    bne              a2, a3, 3b     /* stop at 0 (even count) or 1 (odd) */
     addiu           a0, a0, 8
    beqz             a2, 5f         /* no more pixels for processing */
     nop
4:
/* one more pixel remained (after loop unrolling process finished) */
    lhu              v0, 0(a1)
    addiu            a1, a1, 2
    addiu            a2, a2, -1
    andi             t0, v0, 0xf00  /* t0 = field in bits 11..8 */
    andi             v1, v0, 0xf    /* v1 = field in bits 3..0 */
    andi             v0, v0, 0xf0   /* v0 = field in bits 7..4 */
    sra              t3, t0, 0x4
    sra              t1, v0, 0x4
    sra              t0, t0, 0x8
    sll              t2, v1, 0x4
    or               t0, t0, t3     /* t0 = high field duplicated (8 bits) */
    or               v0, t1, v0     /* v0 = middle field duplicated */
    lui              t1, 0xff00
    or               v1, t2, v1     /* v1 = low field duplicated */
    sll              t0, t0, 0x10
    or               v1, v1, t1
    sll              v0, v0, 0x8
    or               v1, v1, t0
    or               v0, v1, v0     /* v0 = | ff | RR | GG | BB | */
    sw               v0, 0(a0)
    addiu            a0, a0, 4
5:
    jr               ra
     nop

END(fetchUntransformed_444_asm_mips_dsp)


LEAF_MIPS_DSP(fetchUntransformed_argb8565_premultiplied_asm_mips_dsp)
/*
 * Expands 3-byte pixels (a 16-bit 5-6-5 colour value plus an 8-bit
 * alpha) into 32-bit pixels, one pixel per iteration.
 *
 * a0 - dst address
 * a1 - src address
 * a2 - length
 *
 * Per pixel:
 *   - the halfword at offset 0 is loaded unaligned and byte-swapped
 *     (wsbh), the byte at offset 2 is loaded as alpha;
 *   - the 5/6/5 fields are widened to 8 bits each with a sequence of
 *     ins operations, and alpha is inserted into bits 31..24;
 *   - cmpu.lt.qb / pick.qb then select, byte by byte, the alpha value
 *     wherever alpha is lower than the expanded byte, so no colour byte
 *     in the result exceeds alpha.
 *
 * Only t registers are used, so no stack frame is needed.
 */

    beqz      a2, 2f          /* nothing to do for length == 0 */
     nop

1:
    ulh       t1, 0(a1)       /* t1 = 16-bit colour (unaligned load) */
    lbu       t2, 2(a1)       /* t2 = alpha */
    addiu     a2, a2, -1
    wsbh      t1, t1          /* swap the two bytes of the halfword */
    sll       t0, t1, 8       /* t0 = 00000000rrrrrggggggbbbbb00000000 */
    ins       t0, t1, 3, 16   /* t0 = 00000000rrrrrrrrrrggggggbbbbb000 */
    ins       t0, t1, 5, 11   /* t0 = 00000000rrrrrrrrggggggbbbbbbb000 */
    srl       t4, t1, 9       /* t4 = 0000000000000000000000000rrrrrgg */
    replv.qb  t3, t2          /* t3 = alpha replicated into all four bytes */
    ins       t0, t4, 8, 2    /* t0 = 00000000rrrrrrrrggggggggbbbbb000 */
    ins       t0, t1, 3, 5    /* t0 = 00000000rrrrrrrrggggggggbbbbb000 */
    srl       t4, t1, 2       /* t4 = 000000000000000000rrrrrggggggbbb */
    ins       t0, t4, 0, 3    /* t0 = 00000000rrrrrrrrggggggggbbbbbbbb */
    ins       t0, t2, 24, 8   /* t0 = aaaaaaaarrrrrrrrggggggggbbbbbbbb */
    cmpu.lt.qb t3, t0         /* per byte: alpha < component ? */
    pick.qb   t0, t3, t0      /* take alpha where it is lower */
    addiu     a1, a1, 3       /* src += 1 pixel (3 bytes) */
    sw        t0, 0(a0)
    bgtz      a2, 1b
     addiu    a0, a0, 4       /* dst += 1 pixel (delay slot) */
2:
    jr        ra
     nop

END(fetchUntransformed_argb8565_premultiplied_asm_mips_dsp)
