
/* FF is big and ugly so feel free to write lines as long as you like.
 * Aieeeeeeeee !
 *
 * Let me make that clearer:
 * Aieeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee ! !! !!!
 */

#include "device9.h"
#include "basetexture9.h"
#include "vertexdeclaration9.h"
#include "vertexshader9.h"
#include "pixelshader9.h"
#include "nine_ff.h"
#include "nine_defines.h"
#include "nine_helpers.h"
#include "nine_pipe.h"
#include "nine_dump.h"

#include "pipe/p_context.h"
#include "tgsi/tgsi_ureg.h"
#include "tgsi/tgsi_dump.h"
#include "util/u_box.h"
#include "util/u_hash_table.h"

#define NINE_TGSI_LAZY_DEVS 1

#define DBG_CHANNEL DBG_FF

#define NINE_FF_NUM_VS_CONST 256
#define NINE_FF_NUM_PS_CONST 24

#define NINED3DTSS_TCI_DISABLE                       0
#define NINED3DTSS_TCI_PASSTHRU                      1
#define NINED3DTSS_TCI_CAMERASPACENORMAL             2
#define NINED3DTSS_TCI_CAMERASPACEPOSITION           3
#define NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR   4
#define NINED3DTSS_TCI_SPHEREMAP                     5

struct fvec4
{
    float x, y, z, w;
};

struct nine_ff_vs_key
{
    union {
        struct {
            uint32_t position_t : 1;
            uint32_t lighting   : 1;
            uint32_t darkness   : 1; /* lighting enabled but no active lights */
            uint32_t localviewer : 1;
            uint32_t vertexpointsize : 1;
            uint32_t pointscale : 1;
            uint32_t vertexblend : 3;
            uint32_t vertexblend_indexed : 1;
            uint32_t vertextween : 1;
            uint32_t mtl_diffuse : 2; /* 0 = material, 1 = color1, 2 = color2 */
            uint32_t mtl_ambient : 2;
            uint32_t mtl_specular : 2;
            uint32_t mtl_emissive : 2;
            uint32_t fog_mode : 2;
            uint32_t fog_range : 1;
            uint32_t color0in_one : 1;
            uint32_t color1in_one : 1;
            uint32_t pad1 : 8;
            uint32_t tc_gen : 24; /* 8 * 3 bits */
            uint32_t pad2 : 8;
            uint32_t tc_idx : 24;
            uint32_t pad3 : 8;
            uint32_t tc_dim : 24; /* 8 * 3 bits */
            uint32_t pad4 : 8;
        };
        uint64_t value64[2]; /* don't forget to resize VertexShader9.ff_key */
        uint32_t value32[4];
    };
};

/* Texture stage state:
 *
 * COLOROP       D3DTOP 5 bit
 * ALPHAOP       D3DTOP 5 bit
 * COLORARG0     D3DTA  3 bit
 * COLORARG1     D3DTA  3 bit
 * COLORARG2     D3DTA  3 bit
 * ALPHAARG0     D3DTA  3 bit
 * ALPHAARG1     D3DTA  3 bit
 * ALPHAARG2     D3DTA  3 bit
 * RESULTARG     D3DTA  1 bit (CURRENT:0 or TEMP:1)
 * TEXCOORDINDEX 0 - 7  3 bit
 * ===========================
 *                     32 bit per stage
 */
struct nine_ff_ps_key
{
    union {
        struct {
            struct {
                uint32_t colorop   : 5;
                uint32_t alphaop   : 5;
                uint32_t colorarg0 : 3;
                uint32_t colorarg1 : 3;
                uint32_t colorarg2 : 3;
                uint32_t alphaarg0 : 3;
                uint32_t alphaarg1 : 3;
                uint32_t alphaarg2 : 3;
                uint32_t resultarg : 1; /* CURRENT:0 or TEMP:1 */
                uint32_t textarget : 2; /* 1D/2D/3D/CUBE */
                uint32_t projected : 1;
                /* that's 32 bit exactly */
            } ts[8];
            uint32_t fog : 1; /* for vFog with programmable VS */
            uint32_t fog_mode : 2;
            uint32_t specular : 1; /* 9 32-bit words with this */
            uint8_t colorarg_b4[3];
            uint8_t colorarg_b5[3];
            uint8_t alphaarg_b4[3]; /* 11 32-bit words plus a byte */
        };
        uint64_t value64[6]; /* don't forget to resize PixelShader9.ff_key */
        uint32_t value32[12];
    };
};

static unsigned nine_ff_vs_key_hash(void *key)
{
    struct nine_ff_vs_key *vs = key;
    unsigned i;
    uint32_t hash = vs->value32[0];
    for (i = 1; i < Elements(vs->value32); ++i)
        hash ^= vs->value32[i];
    return hash;
}
static int nine_ff_vs_key_comp(void *key1, void *key2)
{
    struct nine_ff_vs_key *a = (struct nine_ff_vs_key *)key1;
    struct nine_ff_vs_key *b = (struct nine_ff_vs_key *)key2;

    return memcmp(a->value64, b->value64, sizeof(a->value64));
}
static unsigned nine_ff_ps_key_hash(void *key)
{
    struct nine_ff_ps_key *ps = key;
    unsigned i;
    uint32_t hash = ps->value32[0];
    for (i = 1; i < Elements(ps->value32); ++i)
        hash ^= ps->value32[i];
    return hash;
}
static int nine_ff_ps_key_comp(void *key1, void *key2)
{
    struct nine_ff_ps_key *a = (struct nine_ff_ps_key *)key1;
    struct nine_ff_ps_key *b = (struct nine_ff_ps_key *)key2;

    return memcmp(a->value64, b->value64, sizeof(a->value64));
}
static unsigned nine_ff_fvf_key_hash(void *key)
{
    return *(DWORD *)key;
}
static int nine_ff_fvf_key_comp(void *key1, void *key2)
{
    return *(DWORD *)key1 != *(DWORD *)key2;
}

static void nine_ff_prune_vs(struct NineDevice9 *);
static void nine_ff_prune_ps(struct NineDevice9 *);

static void nine_ureg_tgsi_dump(struct ureg_program *ureg, boolean override)
{
    if (debug_get_bool_option("NINE_FF_DUMP", FALSE) || override) {
        unsigned count;
        const struct tgsi_token *toks = ureg_get_tokens(ureg, &count);
        tgsi_dump(toks, 0);
        ureg_free_tokens(toks);
    }
}

#define _X(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_X)
#define _Y(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_Y)
#define _Z(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_Z)
#define _W(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_W)

#define _XXXX(r) ureg_scalar(r, TGSI_SWIZZLE_X)
#define _YYYY(r) ureg_scalar(r, TGSI_SWIZZLE_Y)
#define _ZZZZ(r) ureg_scalar(r, TGSI_SWIZZLE_Z)
#define _WWWW(r) ureg_scalar(r, TGSI_SWIZZLE_W)

#define _XYZW(r) (r)

/* AL should contain base address of lights table. */
#define LIGHT_CONST(i)                                                \
    ureg_src_indirect(ureg_DECL_constant(ureg, i), _X(AL))

#define MATERIAL_CONST(i) \
    ureg_DECL_constant(ureg, 19 + (i))

#define _CONST(n) ureg_DECL_constant(ureg, n)

/* VS FF constants layout:
 *
 * CONST[ 0.. 3] D3DTS_WORLD * D3DTS_VIEW * D3DTS_PROJECTION
 * CONST[ 4.. 7] D3DTS_WORLD * D3DTS_VIEW
 * CONST[ 8..11] D3DTS_VIEW * D3DTS_PROJECTION
 * CONST[12..15] D3DTS_VIEW
 * CONST[16..18] Normal matrix
 *
 * CONST[19]      MATERIAL.Emissive + Material.Ambient * RS.Ambient
 * CONST[20]      MATERIAL.Diffuse
 * CONST[21]      MATERIAL.Ambient
 * CONST[22]      MATERIAL.Specular
 * CONST[23].x___ MATERIAL.Power
 * CONST[24]      MATERIAL.Emissive
 * CONST[25]      RS.Ambient
 *
 * CONST[26].x___ RS.PointSizeMin
 * CONST[26]._y__ RS.PointSizeMax
 * CONST[26].__z_ RS.PointSize
 * CONST[26].___w RS.PointScaleA
 * CONST[27].x___ RS.PointScaleB
 * CONST[27]._y__ RS.PointScaleC
 *
 * CONST[28].x___ RS.FogEnd
 * CONST[28]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
 * CONST[28].__z_ RS.FogDensity
 * CONST[29]      RS.FogColor

 * CONST[30].x___ TWEENFACTOR
 *
 * CONST[32].x___ LIGHT[0].Type
 * CONST[32]._yzw LIGHT[0].Attenuation0,1,2
 * CONST[33]      LIGHT[0].Diffuse
 * CONST[34]      LIGHT[0].Specular
 * CONST[35]      LIGHT[0].Ambient
 * CONST[36].xyz_ LIGHT[0].Position
 * CONST[36].___w LIGHT[0].Range
 * CONST[37].xyz_ LIGHT[0].Direction
 * CONST[37].___w LIGHT[0].Falloff
 * CONST[38].x___ cos(LIGHT[0].Theta / 2)
 * CONST[38]._y__ cos(LIGHT[0].Phi / 2)
 * CONST[38].__z_ 1.0f / (cos(LIGHT[0].Theta / 2) - cos(Light[0].Phi / 2))
 * CONST[39].xyz_ LIGHT[0].HalfVector (for directional lights)
 * CONST[39].___w 1 if this is the last active light, 0 if not
 * CONST[40]      LIGHT[1]
 * CONST[48]      LIGHT[2]
 * CONST[56]      LIGHT[3]
 * CONST[64]      LIGHT[4]
 * CONST[72]      LIGHT[5]
 * CONST[80]      LIGHT[6]
 * CONST[88]      LIGHT[7]
 * NOTE: no lighting code is generated if there are no active lights
 *
 * CONST[100].x___ Viewport 2/width
 * CONST[100]._y__ Viewport 2/height
 * CONST[100].__z_ Viewport 1/(zmax - zmin)
 * CONST[101].x___ Viewport x0
 * CONST[101]._y__ Viewport y0
 * CONST[101].__z_ Viewport z0
 *
 * CONST[128..131] D3DTS_TEXTURE0
 * CONST[132..135] D3DTS_TEXTURE1
 * CONST[136..139] D3DTS_TEXTURE2
 * CONST[140..143] D3DTS_TEXTURE3
 * CONST[144..147] D3DTS_TEXTURE4
 * CONST[148..151] D3DTS_TEXTURE5
 * CONST[152..155] D3DTS_TEXTURE6
 * CONST[156..159] D3DTS_TEXTURE7
 *
 * CONST[224] D3DTS_WORLDMATRIX[0]
 * CONST[228] D3DTS_WORLDMATRIX[1]
 * ...
 * CONST[252] D3DTS_WORLDMATRIX[7]
 */
struct vs_build_ctx
{
    struct ureg_program *ureg;
    const struct nine_ff_vs_key *key;

    uint16_t input[PIPE_MAX_ATTRIBS];
    unsigned num_inputs;

    struct ureg_src aVtx;
    struct ureg_src aNrm;
    struct ureg_src aCol[2];
    struct ureg_src aTex[8];
    struct ureg_src aPsz;
    struct ureg_src aInd;
    struct ureg_src aWgt;

    struct ureg_src aVtx1; /* tweening */
    struct ureg_src aNrm1;

    struct ureg_src mtlA;
    struct ureg_src mtlD;
    struct ureg_src mtlS;
    struct ureg_src mtlE;
};

static INLINE unsigned
get_texcoord_sn(struct pipe_screen *screen)
{
    if (screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD))
        return TGSI_SEMANTIC_TEXCOORD;
    return TGSI_SEMANTIC_GENERIC;
}

static INLINE struct ureg_src
build_vs_add_input(struct vs_build_ctx *vs, uint16_t ndecl)
{
    const unsigned i = vs->num_inputs++;
    assert(i < PIPE_MAX_ATTRIBS);
    vs->input[i] = ndecl;
    return ureg_DECL_vs_input(vs->ureg, i);
}

/* NOTE: dst may alias src */
static INLINE void
ureg_normalize3(struct ureg_program *ureg,
                struct ureg_dst dst, struct ureg_src src,
                struct ureg_dst tmp)
{
#ifdef NINE_TGSI_LAZY_DEVS
    struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);

    ureg_DP3(ureg, tmp_x, src, src);
    ureg_RSQ(ureg, tmp_x, _X(tmp));
    ureg_MUL(ureg, dst, src, _X(tmp));
#else
    ureg_NRM(ureg, dst, src);
#endif
}

static void *
nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
{
    const struct nine_ff_vs_key *key = vs->key;
    struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_VERTEX);
    struct ureg_dst oPos, oCol[2], oTex[8], oPsz, oFog;
    struct ureg_dst rCol[2]; /* oCol if no fog, TEMP otherwise */
    struct ureg_dst rVtx, rNrm;
    struct ureg_dst r[8];
    struct ureg_dst AR;
    struct ureg_dst tmp, tmp_x, tmp_z;
    unsigned i, c;
    unsigned label[32], l = 0;
    unsigned num_r = 8;
    boolean need_rNrm = key->lighting || key->pointscale;
    boolean need_rVtx = key->lighting || key->fog_mode;
    const unsigned texcoord_sn = get_texcoord_sn(device->screen);

    vs->ureg = ureg;

    /* Check which inputs we should transform. */
    for (i = 0; i < 8 * 3; i += 3) {
        switch ((key->tc_gen >> i) & 0x3) {
        case NINED3DTSS_TCI_CAMERASPACENORMAL:
            need_rNrm = TRUE;
            break;
        case NINED3DTSS_TCI_CAMERASPACEPOSITION:
            need_rVtx = TRUE;
            break;
        case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
            need_rVtx = need_rNrm = TRUE;
            break;
        default:
            break;
        }
    }

    /* Declare and record used inputs (needed for linkage with vertex format):
     * (texture coordinates handled later)
     */
    vs->aVtx = build_vs_add_input(vs,
        key->position_t ? NINE_DECLUSAGE_POSITIONT : NINE_DECLUSAGE_POSITION);

    if (need_rNrm)
        vs->aNrm = build_vs_add_input(vs, NINE_DECLUSAGE_NORMAL);

    vs->aCol[0] = ureg_imm1f(ureg, 1.0f);
    vs->aCol[1] = ureg_imm1f(ureg, 1.0f);

    if (key->lighting || key->darkness) {
        const unsigned mask = key->mtl_diffuse | key->mtl_specular |
                              key->mtl_ambient | key->mtl_emissive;
        if ((mask & 0x1) && !key->color0in_one)
            vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
        if ((mask & 0x2) && !key->color1in_one)
            vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));

        vs->mtlD = MATERIAL_CONST(1);
        vs->mtlA = MATERIAL_CONST(2);
        vs->mtlS = MATERIAL_CONST(3);
        vs->mtlE = MATERIAL_CONST(5);
        if (key->mtl_diffuse  == 1) vs->mtlD = vs->aCol[0]; else
        if (key->mtl_diffuse  == 2) vs->mtlD = vs->aCol[1];
        if (key->mtl_ambient  == 1) vs->mtlA = vs->aCol[0]; else
        if (key->mtl_ambient  == 2) vs->mtlA = vs->aCol[1];
        if (key->mtl_specular == 1) vs->mtlS = vs->aCol[0]; else
        if (key->mtl_specular == 2) vs->mtlS = vs->aCol[1];
        if (key->mtl_emissive == 1) vs->mtlE = vs->aCol[0]; else
        if (key->mtl_emissive == 2) vs->mtlE = vs->aCol[1];
    } else {
        if (!key->color0in_one) vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
        if (!key->color1in_one) vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
    }

    if (key->vertexpointsize)
        vs->aPsz = build_vs_add_input(vs, NINE_DECLUSAGE_PSIZE);

    if (key->vertexblend_indexed)
        vs->aInd = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDINDICES);
    if (key->vertexblend)
        vs->aWgt = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDWEIGHT);
    if (key->vertextween) {
        vs->aVtx1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(POSITION,1));
        vs->aNrm1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(NORMAL,1));
    }

    /* Declare outputs:
     */
    oPos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0); /* HPOS */
    oCol[0] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0));
    oCol[1] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1));

    if (key->vertexpointsize || key->pointscale) {
        oPsz = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_PSIZE, 0, TGSI_WRITEMASK_X);
        oPsz = ureg_writemask(oPsz, TGSI_WRITEMASK_X);
    }
    if (key->fog_mode) {
        /* We apply fog to the vertex colors, oFog is for programmable shaders only ?
         */
        oFog = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_FOG, 0, TGSI_WRITEMASK_X);
        oFog = ureg_writemask(oFog, TGSI_WRITEMASK_X);
    }

    /* Declare TEMPs:
     */
    for (i = 0; i < num_r; ++i)
        r[i] = ureg_DECL_local_temporary(ureg);
    tmp = r[0];
    tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
    tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
    if (key->lighting || key->vertexblend)
        AR = ureg_DECL_address(ureg);

    if (key->fog_mode) {
        rCol[0] = r[2];
        rCol[1] = r[3];
    } else {
        rCol[0] = oCol[0];
        rCol[1] = oCol[1];
    }

    rVtx = ureg_writemask(r[1], TGSI_WRITEMASK_XYZ);
    rNrm = ureg_writemask(r[2], TGSI_WRITEMASK_XYZ);

    /* === Vertex transformation / vertex blending:
     */
    if (key->vertextween) {
        assert(!key->vertexblend);
        ureg_LRP(ureg, r[2], _XXXX(_CONST(30)), vs->aVtx, vs->aVtx1);
        if (need_rNrm)
            ureg_LRP(ureg, r[3], _XXXX(_CONST(30)), vs->aNrm, vs->aNrm1);
        vs->aVtx = ureg_src(r[2]);
        vs->aNrm = ureg_src(r[3]);
    }

    if (key->vertexblend) {
        struct ureg_src cWM[4];

        for (i = 224; i <= 255; ++i)
            ureg_DECL_constant(ureg, i);

        /* translate world matrix index to constant file index */
        if (key->vertexblend_indexed) {
            ureg_MAD(ureg, tmp, vs->aInd, ureg_imm1f(ureg, 4.0f), ureg_imm1f(ureg, 224.0f));
            ureg_ARL(ureg, AR, ureg_src(tmp));
        }
        for (i = 0; i < key->vertexblend; ++i) {
            for (c = 0; c < 4; ++c) {
                cWM[c] = ureg_src_register(TGSI_FILE_CONSTANT, (224 + i * 4) * !key->vertexblend_indexed + c);
                if (key->vertexblend_indexed)
                    cWM[c] = ureg_src_indirect(cWM[c], ureg_scalar(ureg_src(AR), i));
            }
            /* multiply by WORLD(index) */
            ureg_MUL(ureg, r[0], _XXXX(vs->aVtx), cWM[0]);
            ureg_MAD(ureg, r[0], _YYYY(vs->aVtx), cWM[1], ureg_src(r[0]));
            ureg_MAD(ureg, r[0], _ZZZZ(vs->aVtx), cWM[2], ureg_src(r[0]));
            ureg_MAD(ureg, r[0], _WWWW(vs->aVtx), cWM[3], ureg_src(r[0]));

            /* accumulate weighted position value */
            if (i)
                ureg_MAD(ureg, r[2], ureg_src(r[0]), ureg_scalar(vs->aWgt, i), ureg_src(r[2]));
            else
                ureg_MUL(ureg, r[2], ureg_src(r[0]), ureg_scalar(vs->aWgt, 0));
        }
        /* multiply by VIEW_PROJ */
        ureg_MUL(ureg, r[0], _X(r[2]), _CONST(8));
        ureg_MAD(ureg, r[0], _Y(r[2]), _CONST(9),  ureg_src(r[0]));
        ureg_MAD(ureg, r[0], _Z(r[2]), _CONST(10), ureg_src(r[0]));
        ureg_MAD(ureg, oPos, _W(r[2]), _CONST(11), ureg_src(r[0]));

        if (need_rVtx)
            vs->aVtx = ureg_src(r[2]);
    } else
    if (key->position_t && device->driver_caps.window_space_position_support) {
        ureg_MOV(ureg, oPos, vs->aVtx);
    } else if (key->position_t) {
        /* vs->aVtx contains the coordinates buffer wise.
        * later in the pipeline, clipping, viewport and division
        * by w (rhw = 1/w) are going to be applied, so do the reverse
        * of these transformations (except clipping) to have the good
        * position at the end.*/
        ureg_MOV(ureg, tmp, vs->aVtx);
        /* X from [X_min, X_min + width] to [-1, 1], same for Y. Z to [0, 1] */
        ureg_SUB(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _CONST(101));
        ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _CONST(100));
        ureg_SUB(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, 1.0f));
        /* Y needs to be reversed */
        ureg_MOV(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_negate(ureg_src(tmp)));
        /* inverse rhw */
        ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W), _W(tmp));
        /* multiply X, Y, Z by w */
        ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _W(tmp));
        ureg_MOV(ureg, oPos, ureg_src(tmp));
    } else {
        /* position = vertex * WORLD_VIEW_PROJ */
        ureg_MUL(ureg, r[0], _XXXX(vs->aVtx), _CONST(0));
        ureg_MAD(ureg, r[0], _YYYY(vs->aVtx), _CONST(1), ureg_src(r[0]));
        ureg_MAD(ureg, r[0], _ZZZZ(vs->aVtx), _CONST(2), ureg_src(r[0]));
        ureg_MAD(ureg, oPos, _WWWW(vs->aVtx), _CONST(3), ureg_src(r[0]));
    }

    if (need_rVtx) {
        ureg_MUL(ureg, rVtx, _XXXX(vs->aVtx), _CONST(4));
        ureg_MAD(ureg, rVtx, _YYYY(vs->aVtx), _CONST(5), ureg_src(rVtx));
        ureg_MAD(ureg, rVtx, _ZZZZ(vs->aVtx), _CONST(6), ureg_src(rVtx));
        ureg_MAD(ureg, rVtx, _WWWW(vs->aVtx), _CONST(7), ureg_src(rVtx));
    }
    if (need_rNrm) {
        ureg_MUL(ureg, rNrm, _XXXX(vs->aNrm), _CONST(16));
        ureg_MAD(ureg, rNrm, _YYYY(vs->aNrm), _CONST(17), ureg_src(rNrm));
        ureg_MAD(ureg, rNrm, _ZZZZ(vs->aNrm), _CONST(18), ureg_src(rNrm));
        ureg_normalize3(ureg, rNrm, ureg_src(rNrm), tmp);
    }
    /* NOTE: don't use vs->aVtx, vs->aNrm after this line */

    /* === Process point size:
     */
    if (key->vertexpointsize) {
        struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
#ifdef NINE_TGSI_LAZY_DEVS
        struct ureg_dst tmp_clamp = ureg_DECL_temporary(ureg);

        ureg_MAX(ureg, tmp_clamp, vs->aPsz, _XXXX(cPsz1));
        ureg_MIN(ureg, oPsz, ureg_src(tmp_clamp), _YYYY(cPsz1));
        ureg_release_temporary(ureg, tmp_clamp);
#else
        ureg_CLAMP(ureg, oPsz, vs->aPsz, _XXXX(cPsz1), _YYYY(cPsz1));
#endif
    } else if (key->pointscale) {
        struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
        struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
        struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
        struct ureg_src cPsz2 = ureg_DECL_constant(ureg, 27);

        ureg_DP3(ureg, tmp_x, ureg_src(r[1]), ureg_src(r[1]));
        ureg_SQRT(ureg, tmp_y, _X(tmp));
        ureg_MAD(ureg, tmp_x, _Y(tmp), _YYYY(cPsz2), _XXXX(cPsz2));
        ureg_MAD(ureg, tmp_x, _Y(tmp), _X(tmp), _WWWW(cPsz1));
        ureg_RCP(ureg, tmp_x, ureg_src(tmp));
        ureg_MUL(ureg, tmp_x, ureg_src(tmp), _ZZZZ(cPsz1));
#ifdef NINE_TGSI_LAZY_DEVS
        struct ureg_dst tmp_clamp = ureg_DECL_temporary(ureg);

        ureg_MAX(ureg, tmp_clamp, _X(tmp), _XXXX(cPsz1));
        ureg_MIN(ureg, oPsz, ureg_src(tmp_clamp), _YYYY(cPsz1));
        ureg_release_temporary(ureg, tmp_clamp);
#else
        ureg_CLAMP(ureg, oPsz, _X(tmp), _XXXX(cPsz1), _YYYY(cPsz1));
#endif
    }

    /* Texture coordinate generation:
     * XXX: D3DTTFF_PROJECTED, transform matrix
     */
    for (i = 0; i < 8; ++i) {
        struct ureg_dst dst[5];
        struct ureg_src src;
        unsigned c;
        const unsigned tci = (key->tc_gen >> (i * 3)) & 0x7;
        const unsigned idx = (key->tc_idx >> (i * 3)) & 0x7;
        const unsigned dim = (key->tc_dim >> (i * 3)) & 0x7;

        if (tci == NINED3DTSS_TCI_DISABLE)
            continue;
        oTex[i] = ureg_DECL_output(ureg, texcoord_sn, i);

        if (tci == NINED3DTSS_TCI_PASSTHRU)
            vs->aTex[idx] = build_vs_add_input(vs, NINE_DECLUSAGE_i(TEXCOORD,idx));

        if (!dim) {
            dst[c = 4] = oTex[i];
        } else {
            dst[4] = r[5];
            src = ureg_src(dst[4]);
            for (c = 0; c < (dim - 1); ++c)
                dst[c] = ureg_writemask(tmp, (1 << dim) - 1);
            dst[c] = ureg_writemask(oTex[i], (1 << dim) - 1);
        }

        switch (tci) {
        case NINED3DTSS_TCI_PASSTHRU:
            ureg_MOV(ureg, dst[4], vs->aTex[idx]);
            break;
        case NINED3DTSS_TCI_CAMERASPACENORMAL:
            assert(dim <= 3);
            ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_XYZ), ureg_src(rNrm));
            ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
            break;
        case NINED3DTSS_TCI_CAMERASPACEPOSITION:
            ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_XYZ), ureg_src(rVtx));
            ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
            break;
        case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
            tmp.WriteMask = TGSI_WRITEMASK_XYZ;
            ureg_DP3(ureg, tmp_x, ureg_src(rVtx), ureg_src(rNrm));
            ureg_MUL(ureg, tmp, ureg_src(rNrm), _X(tmp));
            ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
            ureg_SUB(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_XYZ), ureg_src(rVtx), ureg_src(tmp));
            ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
            tmp.WriteMask = TGSI_WRITEMASK_XYZW;
            break;
        case NINED3DTSS_TCI_SPHEREMAP:
            assert(!"TODO");
            break;
        default:
            break;
        }
        if (!dim)
            continue;
        dst[c].WriteMask = ~dst[c].WriteMask;
        if (dst[c].WriteMask)
            ureg_MOV(ureg, dst[c], src); /* store untransformed components */
        dst[c].WriteMask = ~dst[c].WriteMask;
        if (dim > 0) ureg_MUL(ureg, dst[0], _XXXX(src), _CONST(128 + i * 4));
        if (dim > 1) ureg_MAD(ureg, dst[1], _YYYY(src), _CONST(129 + i * 4), ureg_src(tmp));
        if (dim > 2) ureg_MAD(ureg, dst[2], _ZZZZ(src), _CONST(130 + i * 4), ureg_src(tmp));
        if (dim > 3) ureg_MAD(ureg, dst[3], _WWWW(src), _CONST(131 + i * 4), ureg_src(tmp));
    }

    /* === Lighting:
     *
     * DIRECTIONAL:  Light at infinite distance, parallel rays, no attenuation.
     * POINT: Finite distance to scene, divergent rays, isotropic, attenuation.
     * SPOT: Finite distance, divergent rays, angular dependence, attenuation.
     *
     * vec3 normal = normalize(in.Normal * NormalMatrix);
     * vec3 hitDir = light.direction;
     * float atten = 1.0;
     *
     * if (light.type != DIRECTIONAL)
     * {
     *     vec3 hitVec = light.position - eyeVertex;
     *     float d = length(hitVec);
     *     hitDir = hitVec / d;
     *     atten = 1 / ((light.atten2 * d + light.atten1) * d + light.atten0);
     * }
     *
     * if (light.type == SPOTLIGHT)
     * {
     *     float rho = dp3(-hitVec, light.direction);
     *     if (rho < cos(light.phi / 2))
     *         atten = 0;
     *     if (rho < cos(light.theta / 2))
     *         atten *= pow(some_func(rho), light.falloff);
     * }
     *
     * float nDotHit = dp3_sat(normal, hitVec);
     * float powFact = 0.0;
     *
     * if (nDotHit > 0.0)
     * {
     *     vec3 midVec = normalize(hitDir + eye);
     *     float nDotMid = dp3_sat(normal, midVec);
     *     pFact = pow(nDotMid, material.power);
     * }
     *
     * ambient += light.ambient * atten;
     * diffuse += light.diffuse * atten * nDotHit;
     * specular += light.specular * atten * powFact;
     */
    if (key->lighting) {
        struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);

        struct ureg_dst rAtt = ureg_writemask(r[1], TGSI_WRITEMASK_W);
        struct ureg_dst rHit = ureg_writemask(r[3], TGSI_WRITEMASK_XYZ);
        struct ureg_dst rMid = ureg_writemask(r[4], TGSI_WRITEMASK_XYZ);

        struct ureg_dst rCtr = ureg_writemask(r[2], TGSI_WRITEMASK_W);

        struct ureg_dst AL = ureg_writemask(AR, TGSI_WRITEMASK_X);

        /* Light.*.Alpha is not used. */
        struct ureg_dst rD = ureg_writemask(r[5], TGSI_WRITEMASK_XYZ);
        struct ureg_dst rA = ureg_writemask(r[6], TGSI_WRITEMASK_XYZ);
        struct ureg_dst rS = ureg_writemask(r[7], TGSI_WRITEMASK_XYZ);

        struct ureg_src mtlP = _XXXX(MATERIAL_CONST(4));

        struct ureg_src cLKind = _XXXX(LIGHT_CONST(0));
        struct ureg_src cLAtt0 = _YYYY(LIGHT_CONST(0));
        struct ureg_src cLAtt1 = _ZZZZ(LIGHT_CONST(0));
        struct ureg_src cLAtt2 = _WWWW(LIGHT_CONST(0));
        struct ureg_src cLColD = _XYZW(LIGHT_CONST(1));
        struct ureg_src cLColS = _XYZW(LIGHT_CONST(2));
        struct ureg_src cLColA = _XYZW(LIGHT_CONST(3));
        struct ureg_src cLPos  = _XYZW(LIGHT_CONST(4));
        struct ureg_src cLRng  = _WWWW(LIGHT_CONST(4));
        struct ureg_src cLDir  = _XYZW(LIGHT_CONST(5));
        struct ureg_src cLFOff = _WWWW(LIGHT_CONST(5));
        struct ureg_src cLTht  = _XXXX(LIGHT_CONST(6));
        struct ureg_src cLPhi  = _YYYY(LIGHT_CONST(6));
        struct ureg_src cLSDiv = _ZZZZ(LIGHT_CONST(6));
        struct ureg_src cLLast = _WWWW(LIGHT_CONST(7));

        const unsigned loop_label = l++;

        ureg_MOV(ureg, rCtr, ureg_imm1f(ureg, 32.0f)); /* &lightconst(0) */
        ureg_MOV(ureg, rD, ureg_imm1f(ureg, 0.0f));
        ureg_MOV(ureg, rA, ureg_imm1f(ureg, 0.0f));
        ureg_MOV(ureg, rS, ureg_imm1f(ureg, 0.0f));
        rD = ureg_saturate(rD);
        rA = ureg_saturate(rA);
        rS = ureg_saturate(rS);


        /* loop management */
        ureg_BGNLOOP(ureg, &label[loop_label]);
        ureg_ARL(ureg, AL, _W(rCtr));

        /* if (not DIRECTIONAL light): */
        ureg_SNE(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_DIRECTIONAL));
        ureg_MOV(ureg, rHit, ureg_negate(cLDir));
        ureg_MOV(ureg, rAtt, ureg_imm1f(ureg, 1.0f));
        ureg_IF(ureg, _X(tmp), &label[l++]);
        {
            /* hitDir = light.position - eyeVtx
             * d = length(hitDir)
             * hitDir /= d
             */
            ureg_SUB(ureg, rHit, cLPos, ureg_src(rVtx));
            ureg_DP3(ureg, tmp_x, ureg_src(rHit), ureg_src(rHit));
            ureg_RSQ(ureg, tmp_y, _X(tmp));
            ureg_MUL(ureg, rHit, ureg_src(rHit), _Y(tmp)); /* normalize */
            ureg_MUL(ureg, tmp_x, _X(tmp), _Y(tmp)); /* length */

            /* att = 1.0 / (light.att0 + (light.att1 + light.att2 * d) * d) */
            ureg_MAD(ureg, rAtt, _X(tmp), cLAtt2, cLAtt1);
            ureg_MAD(ureg, rAtt, _X(tmp), _W(rAtt), cLAtt0);
            ureg_RCP(ureg, rAtt, _W(rAtt));
            /* cut-off if distance exceeds Light.Range */
            ureg_SLT(ureg, tmp_x, _X(tmp), cLRng);
            ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
        }
        ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
        ureg_ENDIF(ureg);

        /* if (SPOT light) */
        ureg_SEQ(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_SPOT));
        ureg_IF(ureg, _X(tmp), &label[l++]);
        {
            /* rho = dp3(-hitDir, light.spotDir)
             *
             * if (rho  > light.ctht2) NOTE: 0 <= phi <= pi, 0 <= theta <= phi
             *     spotAtt = 1
             * else
             * if (rho <= light.cphi2)
             *     spotAtt = 0
             * else
             *     spotAtt = (rho - light.cphi2) / (light.ctht2 - light.cphi2) ^ light.falloff
             */
            ureg_DP3(ureg, tmp_y, ureg_negate(ureg_src(rHit)), cLDir); /* rho */
            ureg_SUB(ureg, tmp_x, _Y(tmp), cLPhi);
            ureg_MUL(ureg, tmp_x, _X(tmp), cLSDiv);
            ureg_POW(ureg, tmp_x, _X(tmp), cLFOff); /* spotAtten */
            ureg_SGE(ureg, tmp_z, _Y(tmp), cLTht); /* if inside theta && phi */
            ureg_SGE(ureg, tmp_y, _Y(tmp), cLPhi); /* if inside phi */
            ureg_MAD(ureg, ureg_saturate(tmp_x), _X(tmp), _Y(tmp), _Z(tmp));
            ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
        }
        ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
        ureg_ENDIF(ureg);

        /* directional factors, let's not use LIT because of clarity */
        ureg_DP3(ureg, ureg_saturate(tmp_x), ureg_src(rNrm), ureg_src(rHit));
        ureg_MOV(ureg, tmp_y, ureg_imm1f(ureg, 0.0f));
        ureg_IF(ureg, _X(tmp), &label[l++]);
        {
            /* midVec = normalize(hitDir + eyeDir) */
            if (key->localviewer) {
                ureg_normalize3(ureg, rMid, ureg_src(rVtx), tmp);
                ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_negate(ureg_src(rMid)));
            } else {
                ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_imm3f(ureg, 0.0f, 0.0f, 1.0f));
            }
            ureg_normalize3(ureg, rMid, ureg_src(rMid), tmp);
            ureg_DP3(ureg, ureg_saturate(tmp_y), ureg_src(rNrm), ureg_src(rMid));
            ureg_POW(ureg, tmp_y, _Y(tmp), mtlP);

            ureg_MUL(ureg, tmp_x, _W(rAtt), _X(tmp)); /* dp3(normal,hitDir) * att */
            ureg_MUL(ureg, tmp_y, _W(rAtt), _Y(tmp)); /* power factor * att */
            ureg_MAD(ureg, rD, cLColD, _X(tmp), ureg_src(rD)); /* accumulate diffuse */
            ureg_MAD(ureg, rS, cLColS, _Y(tmp), ureg_src(rS)); /* accumulate specular */
        }
        ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
        ureg_ENDIF(ureg);

        ureg_MAD(ureg, rA, cLColA, _W(rAtt), ureg_src(rA)); /* accumulate ambient */

        /* break if this was the last light */
        ureg_IF(ureg, cLLast, &label[l++]);
        ureg_BRK(ureg);
        ureg_ENDIF(ureg);
        ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));

        ureg_ADD(ureg, rCtr, _W(rCtr), ureg_imm1f(ureg, 8.0f));
        ureg_fixup_label(ureg, label[loop_label], ureg_get_instruction_number(ureg));
        ureg_ENDLOOP(ureg, &label[loop_label]);

        /* Set alpha factors of illumination to 1.0 for the multiplications. */
        rD.WriteMask = TGSI_WRITEMASK_W; rD.Saturate = 0;
        rS.WriteMask = TGSI_WRITEMASK_W; rS.Saturate = 0;
        rA.WriteMask = TGSI_WRITEMASK_W; rA.Saturate = 0;
        ureg_MOV(ureg, rD, ureg_imm1f(ureg, 1.0f));
        ureg_MOV(ureg, rS, ureg_imm1f(ureg, 1.0f));

        /* Apply to material:
         *
         * oCol[0] = (material.emissive + material.ambient * rs.ambient) +
         *           material.ambient * ambient +
         *           material.diffuse * diffuse +
         * oCol[1] = material.specular * specular;
         */
        if (key->mtl_emissive == 0 && key->mtl_ambient == 0) {
            ureg_MOV(ureg, rA, ureg_imm1f(ureg, 1.0f));
            ureg_MAD(ureg, tmp, ureg_src(rA), vs->mtlA, _CONST(19));
        } else {
            ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), _CONST(25));
            ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), vs->mtlA, ureg_src(tmp), vs->mtlE);
            ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W  ), vs->mtlA, vs->mtlE);
        }
        ureg_MAD(ureg, rCol[0], ureg_src(rD), vs->mtlD, ureg_src(tmp));
        ureg_MUL(ureg, rCol[1], ureg_src(rS), vs->mtlS);
    } else
    /* COLOR */
    if (key->darkness) {
        if (key->mtl_emissive == 0 && key->mtl_ambient == 0) {
            ureg_MAD(ureg, rCol[0], vs->mtlD, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f), _CONST(19));
        } else {
            ureg_MAD(ureg, ureg_writemask(rCol[0], TGSI_WRITEMASK_XYZ), vs->mtlA, _CONST(25), vs->mtlE);
            ureg_ADD(ureg, ureg_writemask(tmp,     TGSI_WRITEMASK_W), vs->mtlA, vs->mtlE);
            ureg_ADD(ureg, ureg_writemask(rCol[0], TGSI_WRITEMASK_W), vs->mtlD, _W(tmp));
        }
        ureg_MUL(ureg, rCol[1], ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f), vs->mtlS);
    } else {
        ureg_MOV(ureg, rCol[0], vs->aCol[0]);
        ureg_MOV(ureg, rCol[1], vs->aCol[1]);
    }

    /* === Process fog.
     *
     * exp(x) = ex2(log2(e) * x)
     */
    if (key->fog_mode) {
        /* Fog doesn't affect alpha, TODO: combine with light code output */
        ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), _W(rCol[0]));
        ureg_MOV(ureg, ureg_writemask(oCol[1], TGSI_WRITEMASK_W), _W(rCol[1]));

        if (key->position_t) {
            ureg_MOV(ureg, ureg_saturate(tmp_x), ureg_scalar(vs->aCol[1], TGSI_SWIZZLE_W));
        } else
        if (key->fog_range) {
            ureg_DP3(ureg, tmp_x, ureg_src(rVtx), ureg_src(rVtx));
            ureg_RSQ(ureg, tmp_z, _X(tmp));
            ureg_MUL(ureg, tmp_z, _Z(tmp), _X(tmp));
        } else {
            ureg_MOV(ureg, tmp_z, ureg_abs(_Z(rVtx)));
        }

        if (key->fog_mode == D3DFOG_EXP) {
            ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
            ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
            ureg_EX2(ureg, tmp_x, _X(tmp));
        } else
        if (key->fog_mode == D3DFOG_EXP2) {
            ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
            ureg_MUL(ureg, tmp_x, _X(tmp), _X(tmp));
            ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
            ureg_EX2(ureg, tmp_x, _X(tmp));
        } else
        if (key->fog_mode == D3DFOG_LINEAR && !key->position_t) {
            ureg_SUB(ureg, tmp_x, _XXXX(_CONST(28)), _Z(tmp));
            ureg_MUL(ureg, ureg_saturate(tmp_x), _X(tmp), _YYYY(_CONST(28)));
        }
        ureg_MOV(ureg, oFog, _X(tmp));
        ureg_LRP(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), _X(tmp), ureg_src(rCol[0]), _CONST(29));
        ureg_LRP(ureg, ureg_writemask(oCol[1], TGSI_WRITEMASK_XYZ), _X(tmp), ureg_src(rCol[1]), _CONST(29));
    }

    if (key->position_t && device->driver_caps.window_space_position_support)
        ureg_property_vs_window_space_position(ureg, TRUE);

    ureg_END(ureg);
    nine_ureg_tgsi_dump(ureg, FALSE);
    return ureg_create_shader_and_destroy(ureg, device->pipe);
}

/* PS FF constants layout:
 *
 * CONST[ 0.. 7]      stage[i].D3DTSS_CONSTANT
 * CONST[ 8..15].x___ stage[i].D3DTSS_BUMPENVMAT00
 * CONST[ 8..15]._y__ stage[i].D3DTSS_BUMPENVMAT01
 * CONST[ 8..15].__z_ stage[i].D3DTSS_BUMPENVMAT10
 * CONST[ 8..15].___w stage[i].D3DTSS_BUMPENVMAT11
 * CONST[16..19].x_z_ stage[i].D3DTSS_BUMPENVLSCALE
 * CONST[17..19]._y_w stage[i].D3DTSS_BUMPENVLOFFSET
 *
 * CONST[20] D3DRS_TEXTUREFACTOR
 * CONST[21] D3DRS_FOGCOLOR
 * CONST[22].x___ RS.FogEnd
 * CONST[22]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
 * CONST[22].__z_ RS.FogDensity
 */
struct ps_build_ctx
{
    struct ureg_program *ureg;

    struct ureg_src vC[2]; /* DIFFUSE, SPECULAR */
    struct ureg_src vT[8]; /* TEXCOORD[i] */
    struct ureg_dst r[6];  /* TEMPs */
    struct ureg_dst rCur; /* D3DTA_CURRENT */
    struct ureg_dst rMod;
    struct ureg_src rCurSrc;
    struct ureg_dst rTmp; /* D3DTA_TEMP */
    struct ureg_src rTmpSrc;
    struct ureg_dst rTex;
    struct ureg_src rTexSrc;
    struct ureg_src cBEM[8];
    struct ureg_src s[8];

    struct {
        unsigned index;
        unsigned index_pre_mod;
        unsigned num_regs;
    } stage;
};

static struct ureg_src
ps_get_ts_arg(struct ps_build_ctx *ps, unsigned ta)
{
    struct ureg_src reg;

    switch (ta & D3DTA_SELECTMASK) {
    case D3DTA_CONSTANT:
        reg = ureg_DECL_constant(ps->ureg, ps->stage.index);
        break;
    case D3DTA_CURRENT:
        reg = (ps->stage.index == ps->stage.index_pre_mod) ? ureg_src(ps->rMod) : ps->rCurSrc;
        break;
    case D3DTA_DIFFUSE:
        reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_PERSPECTIVE);
        break;
    case D3DTA_SPECULAR:
        reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_PERSPECTIVE);
        break;
    case D3DTA_TEMP:
        reg = ps->rTmpSrc;
        break;
    case D3DTA_TEXTURE:
        reg = ps->rTexSrc;
        break;
    case D3DTA_TFACTOR:
        reg = ureg_DECL_constant(ps->ureg, 20);
        break;
    default:
        assert(0);
        reg = ureg_src_undef();
        break;
    }
    if (ta & D3DTA_COMPLEMENT) {
        struct ureg_dst dst = ps->r[ps->stage.num_regs++];
        ureg_SUB(ps->ureg, dst, ureg_imm1f(ps->ureg, 1.0f), reg);
        reg = ureg_src(dst);
    }
    if (ta & D3DTA_ALPHAREPLICATE)
        reg = _WWWW(reg);
    return reg;
}

static struct ureg_dst
ps_get_ts_dst(struct ps_build_ctx *ps, unsigned ta)
{
    assert(!(ta & (D3DTA_COMPLEMENT | D3DTA_ALPHAREPLICATE)));

    switch (ta & D3DTA_SELECTMASK) {
    case D3DTA_CURRENT:
        return ps->rCur;
    case D3DTA_TEMP:
        return ps->rTmp;
    default:
        assert(0);
        return ureg_dst_undef();
    }
}

static uint8_t ps_d3dtop_args_mask(D3DTEXTUREOP top)
{
    switch (top) {
    case D3DTOP_DISABLE:
        return 0x0;
    case D3DTOP_SELECTARG1:
    case D3DTOP_PREMODULATE:
        return 0x2;
    case D3DTOP_SELECTARG2:
        return 0x4;
    case D3DTOP_MULTIPLYADD:
    case D3DTOP_LERP:
        return 0x7;
    default:
        return 0x6;
    }
}

static INLINE boolean
is_MOV_no_op(struct ureg_dst dst, struct ureg_src src)
{
    return !dst.WriteMask ||
        (dst.File == src.File &&
         dst.Index == src.Index &&
         !dst.Indirect &&
         !dst.Saturate &&
         !src.Indirect &&
         !src.Negate &&
         !src.Absolute &&
         (!(dst.WriteMask & TGSI_WRITEMASK_X) || (src.SwizzleX == TGSI_SWIZZLE_X)) &&
         (!(dst.WriteMask & TGSI_WRITEMASK_Y) || (src.SwizzleY == TGSI_SWIZZLE_Y)) &&
         (!(dst.WriteMask & TGSI_WRITEMASK_Z) || (src.SwizzleZ == TGSI_SWIZZLE_Z)) &&
         (!(dst.WriteMask & TGSI_WRITEMASK_W) || (src.SwizzleW == TGSI_SWIZZLE_W)));

}

static void
ps_do_ts_op(struct ps_build_ctx *ps, unsigned top, struct ureg_dst dst, struct ureg_src *arg)
{
    struct ureg_program *ureg = ps->ureg;
    struct ureg_dst tmp = ps->r[ps->stage.num_regs];
    struct ureg_dst tmp2 = ps->r[ps->stage.num_regs+1];
    struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);

    tmp.WriteMask = dst.WriteMask;

    if (top != D3DTOP_SELECTARG1 && top != D3DTOP_SELECTARG2 &&
        top != D3DTOP_MODULATE && top != D3DTOP_PREMODULATE &&
        top != D3DTOP_BLENDDIFFUSEALPHA && top != D3DTOP_BLENDTEXTUREALPHA &&
        top != D3DTOP_BLENDFACTORALPHA && top != D3DTOP_BLENDCURRENTALPHA &&
        top != D3DTOP_BUMPENVMAP && top != D3DTOP_BUMPENVMAPLUMINANCE &&
        top != D3DTOP_LERP)
        dst = ureg_saturate(dst);

    switch (top) {
    case D3DTOP_SELECTARG1:
        if (!is_MOV_no_op(dst, arg[1]))
            ureg_MOV(ureg, dst, arg[1]);
        break;
    case D3DTOP_SELECTARG2:
        if (!is_MOV_no_op(dst, arg[2]))
            ureg_MOV(ureg, dst, arg[2]);
        break;
    case D3DTOP_MODULATE:
        ureg_MUL(ureg, dst, arg[1], arg[2]);
        break;
    case D3DTOP_MODULATE2X:
        ureg_MUL(ureg, tmp, arg[1], arg[2]);
        ureg_ADD(ureg, dst, ureg_src(tmp), ureg_src(tmp));
        break;
    case D3DTOP_MODULATE4X:
        ureg_MUL(ureg, tmp, arg[1], arg[2]);
        ureg_MUL(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 4.0f));
        break;
    case D3DTOP_ADD:
        ureg_ADD(ureg, dst, arg[1], arg[2]);
        break;
    case D3DTOP_ADDSIGNED:
        ureg_ADD(ureg, tmp, arg[1], arg[2]);
        ureg_SUB(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 0.5f));
        break;
    case D3DTOP_ADDSIGNED2X:
        ureg_ADD(ureg, tmp, arg[1], arg[2]);
        ureg_MAD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 2.0f), ureg_imm1f(ureg, -1.0f));
        break;
    case D3DTOP_SUBTRACT:
        ureg_SUB(ureg, dst, arg[1], arg[2]);
        break;
    case D3DTOP_ADDSMOOTH:
        ureg_SUB(ureg, tmp, ureg_imm1f(ureg, 1.0f), arg[1]);
        ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], arg[1]);
        break;
    case D3DTOP_BLENDDIFFUSEALPHA:
        ureg_LRP(ureg, dst, _WWWW(ps->vC[0]), arg[1], arg[2]);
        break;
    case D3DTOP_BLENDTEXTUREALPHA:
        /* XXX: alpha taken from previous stage, texture or result ? */
        ureg_LRP(ureg, dst, _W(ps->rTex), arg[1], arg[2]);
        break;
    case D3DTOP_BLENDFACTORALPHA:
        ureg_LRP(ureg, dst, _WWWW(_CONST(20)), arg[1], arg[2]);
        break;
    case D3DTOP_BLENDTEXTUREALPHAPM:
        ureg_SUB(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), _W(ps->rTex));
        ureg_MAD(ureg, dst, arg[2], _X(tmp), arg[1]);
        break;
    case D3DTOP_BLENDCURRENTALPHA:
        ureg_LRP(ureg, dst, _WWWW(ps->rCurSrc), arg[1], arg[2]);
        break;
    case D3DTOP_PREMODULATE:
        ureg_MOV(ureg, dst, arg[1]);
        ps->stage.index_pre_mod = ps->stage.index + 1;
        break;
    case D3DTOP_MODULATEALPHA_ADDCOLOR:
        ureg_MAD(ureg, dst, _WWWW(arg[1]), arg[2], arg[1]);
        break;
    case D3DTOP_MODULATECOLOR_ADDALPHA:
        ureg_MAD(ureg, dst, arg[1], arg[2], _WWWW(arg[1]));
        break;
    case D3DTOP_MODULATEINVALPHA_ADDCOLOR:
        ureg_SUB(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), _WWWW(arg[1]));
        ureg_MAD(ureg, dst, _X(tmp), arg[2], arg[1]);
        break;
    case D3DTOP_MODULATEINVCOLOR_ADDALPHA:
        ureg_SUB(ureg, tmp, ureg_imm1f(ureg, 1.0f), arg[1]);
        ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], _WWWW(arg[1]));
        break;
    case D3DTOP_BUMPENVMAP:
        break;
    case D3DTOP_BUMPENVMAPLUMINANCE:
        break;
    case D3DTOP_DOTPRODUCT3:
        ureg_SUB(ureg, tmp, arg[1], ureg_imm4f(ureg,0.5,0.5,0.5,0.5));
        ureg_SUB(ureg, tmp2, arg[2] , ureg_imm4f(ureg,0.5,0.5,0.5,0.5));
        ureg_DP3(ureg, tmp, ureg_src(tmp), ureg_src(tmp2));
        ureg_MUL(ureg, ureg_saturate(dst), ureg_src(tmp), ureg_imm4f(ureg,4.0,4.0,4.0,4.0));
        break;
    case D3DTOP_MULTIPLYADD:
        ureg_MAD(ureg, dst, arg[2], arg[0], arg[1]);
        break;
    case D3DTOP_LERP:
        ureg_LRP(ureg, dst, arg[1], arg[2], arg[0]);
        break;
    case D3DTOP_DISABLE:
        /* no-op ? */
        break;
    default:
        assert(!"invalid D3DTOP");
        break;
    }
}

static void *
nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
{
    struct ps_build_ctx ps;
    struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT);
    struct ureg_dst oCol;
    unsigned i, s;
    const unsigned texcoord_sn = get_texcoord_sn(device->screen);

    memset(&ps, 0, sizeof(ps));
    ps.ureg = ureg;
    ps.stage.index_pre_mod = -1;

    ps.vC[0] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_PERSPECTIVE);

    /* Declare all TEMPs we might need, serious drivers have a register allocator. */
    for (i = 0; i < Elements(ps.r); ++i)
        ps.r[i] = ureg_DECL_local_temporary(ureg);
    ps.rCur = ps.r[0];
    ps.rTmp = ps.r[1];
    ps.rTex = ps.r[2];
    ps.rCurSrc = ureg_src(ps.rCur);
    ps.rTmpSrc = ureg_src(ps.rTmp);
    ps.rTexSrc = ureg_src(ps.rTex);

    for (s = 0; s < 8; ++s) {
        ps.s[s] = ureg_src_undef();

        if (key->ts[s].colorop != D3DTOP_DISABLE) {
            if (key->ts[s].colorarg0 == D3DTA_SPECULAR ||
                key->ts[s].colorarg1 == D3DTA_SPECULAR ||
                key->ts[s].colorarg2 == D3DTA_SPECULAR)
                ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_PERSPECTIVE);

            if (key->ts[s].colorarg0 == D3DTA_TEXTURE ||
                key->ts[s].colorarg1 == D3DTA_TEXTURE ||
                key->ts[s].colorarg2 == D3DTA_TEXTURE) {
                ps.s[s] = ureg_DECL_sampler(ureg, s);
                ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
            }
            if (s && (key->ts[s - 1].colorop == D3DTOP_PREMODULATE ||
                      key->ts[s - 1].alphaop == D3DTOP_PREMODULATE))
                ps.s[s] = ureg_DECL_sampler(ureg, s);
        }

        if (key->ts[s].alphaop != D3DTOP_DISABLE) {
            if (key->ts[s].alphaarg0 == D3DTA_SPECULAR ||
                key->ts[s].alphaarg1 == D3DTA_SPECULAR ||
                key->ts[s].alphaarg2 == D3DTA_SPECULAR)
                ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_PERSPECTIVE);

            if (key->ts[s].alphaarg0 == D3DTA_TEXTURE ||
                key->ts[s].alphaarg1 == D3DTA_TEXTURE ||
                key->ts[s].alphaarg2 == D3DTA_TEXTURE) {
                ps.s[s] = ureg_DECL_sampler(ureg, s);
                ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
            }
        }
    }
    if (key->specular)
        ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_PERSPECTIVE);

    oCol = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);

    if (key->ts[0].colorop == D3DTOP_DISABLE &&
        key->ts[0].alphaop == D3DTOP_DISABLE)
        ureg_MOV(ureg, ps.rCur, ps.vC[0]);
    /* Or is it undefined then ? */

    /* Run stages.
     */
    for (s = 0; s < 8; ++s) {
        unsigned colorarg[3];
        unsigned alphaarg[3];
        const uint8_t used_c = ps_d3dtop_args_mask(key->ts[s].colorop);
        const uint8_t used_a = ps_d3dtop_args_mask(key->ts[s].alphaop);
        struct ureg_dst dst;
        struct ureg_src arg[3];

        if (key->ts[s].colorop == D3DTOP_DISABLE &&
            key->ts[s].alphaop == D3DTOP_DISABLE)
            continue;
        ps.stage.index = s;
        ps.stage.num_regs = 3;

        DBG("STAGE[%u]: colorop=%s alphaop=%s\n", s,
            nine_D3DTOP_to_str(key->ts[s].colorop),
            nine_D3DTOP_to_str(key->ts[s].alphaop));

        if (!ureg_src_is_undef(ps.s[s])) {
            unsigned target;
            switch (key->ts[s].textarget) {
            case 0: target = TGSI_TEXTURE_1D; break;
            case 1: target = TGSI_TEXTURE_2D; break;
            case 2: target = TGSI_TEXTURE_3D; break;
            case 3: target = TGSI_TEXTURE_CUBE; break;
            /* this is a 2 bit bitfield, do I really need a default case ? */
            }

            /* sample the texture */
            if (key->ts[s].colorop == D3DTOP_BUMPENVMAP ||
                key->ts[s].colorop == D3DTOP_BUMPENVMAPLUMINANCE) {
            }
            if (key->ts[s].projected)
                ureg_TXP(ureg, ps.rTex, target, ps.vT[s], ps.s[s]);
            else
                ureg_TEX(ureg, ps.rTex, target, ps.vT[s], ps.s[s]);
        }

        if (s == 0 &&
            (key->ts[0].resultarg != 0 /* not current */ ||
             key->ts[0].colorop == D3DTOP_DISABLE ||
             key->ts[0].alphaop == D3DTOP_DISABLE ||
             key->ts[0].colorarg0 == D3DTA_CURRENT ||
             key->ts[0].colorarg1 == D3DTA_CURRENT ||
             key->ts[0].colorarg2 == D3DTA_CURRENT ||
             key->ts[0].alphaarg0 == D3DTA_CURRENT ||
             key->ts[0].alphaarg1 == D3DTA_CURRENT ||
             key->ts[0].alphaarg2 == D3DTA_CURRENT)
           ) {
            /* Initialize D3DTA_CURRENT.
             * (Yes we can do this before the loop but not until
             *  NVE4 has an instruction scheduling pass.)
             */
            ureg_MOV(ureg, ps.rCur, ps.vC[0]);
        }

        dst = ps_get_ts_dst(&ps, key->ts[s].resultarg ? D3DTA_TEMP : D3DTA_CURRENT);

        if (ps.stage.index_pre_mod == ps.stage.index) {
            ps.rMod = ps.r[ps.stage.num_regs++];
            ureg_MUL(ureg, ps.rMod, ps.rCurSrc, ps.rTexSrc);
        }

        colorarg[0] = (key->ts[s].colorarg0 | ((key->colorarg_b4[0] >> s) << 4) | ((key->colorarg_b5[0] >> s) << 5)) & 0x3f;
        colorarg[1] = (key->ts[s].colorarg1 | ((key->colorarg_b4[1] >> s) << 4) | ((key->colorarg_b5[1] >> s) << 5)) & 0x3f;
        colorarg[2] = (key->ts[s].colorarg2 | ((key->colorarg_b4[2] >> s) << 4) | ((key->colorarg_b5[2] >> s) << 5)) & 0x3f;
        alphaarg[0] = (key->ts[s].alphaarg0 | ((key->alphaarg_b4[0] >> s) << 4)) & 0x1f;
        alphaarg[1] = (key->ts[s].alphaarg1 | ((key->alphaarg_b4[1] >> s) << 4)) & 0x1f;
        alphaarg[2] = (key->ts[s].alphaarg2 | ((key->alphaarg_b4[2] >> s) << 4)) & 0x1f;

        if (key->ts[s].colorop != key->ts[s].alphaop ||
            colorarg[0] != alphaarg[0] ||
            colorarg[1] != alphaarg[1] ||
            colorarg[2] != alphaarg[2])
            dst.WriteMask = TGSI_WRITEMASK_XYZ;

        if (used_c & 0x1) arg[0] = ps_get_ts_arg(&ps, colorarg[0]);
        if (used_c & 0x2) arg[1] = ps_get_ts_arg(&ps, colorarg[1]);
        if (used_c & 0x4) arg[2] = ps_get_ts_arg(&ps, colorarg[2]);
        ps_do_ts_op(&ps, key->ts[s].colorop, dst, arg);

        if (dst.WriteMask != TGSI_WRITEMASK_XYZW) {
            dst.WriteMask = TGSI_WRITEMASK_W;

            if (used_a & 0x1) arg[0] = ps_get_ts_arg(&ps, alphaarg[0]);
            if (used_a & 0x2) arg[1] = ps_get_ts_arg(&ps, alphaarg[1]);
            if (used_a & 0x4) arg[2] = ps_get_ts_arg(&ps, alphaarg[2]);
            ps_do_ts_op(&ps, key->ts[s].alphaop, dst, arg);
        }
    }

    if (key->specular)
        ureg_ADD(ureg, ps.rCur, ps.rCurSrc, ps.vC[1]);

    /* Fog.
     */
    if (key->fog_mode) {
        struct ureg_src vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0, TGSI_INTERPOLATE_LINEAR);
        struct ureg_dst rFog = ureg_writemask(ps.rTmp, TGSI_WRITEMASK_X);
        if (key->fog_mode == D3DFOG_EXP) {
            ureg_MUL(ureg, rFog, _ZZZZ(vPos), _ZZZZ(_CONST(22)));
            ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
            ureg_EX2(ureg, rFog, _X(rFog));
        } else
        if (key->fog_mode == D3DFOG_EXP2) {
            ureg_MUL(ureg, rFog, _ZZZZ(vPos), _ZZZZ(_CONST(22)));
            ureg_MUL(ureg, rFog, _X(rFog), _X(rFog));
            ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
            ureg_EX2(ureg, rFog, _X(rFog));
        } else
        if (key->fog_mode == D3DFOG_LINEAR) {
            ureg_SUB(ureg, rFog, _XXXX(_CONST(22)), _ZZZZ(vPos));
            ureg_MUL(ureg, ureg_saturate(rFog), _X(rFog), _YYYY(_CONST(22)));
        }
        ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _X(rFog), ps.rCurSrc, _CONST(21));
        ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
    } else
    if (key->fog) {
        struct ureg_src vFog = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_FOG, 0, TGSI_INTERPOLATE_PERSPECTIVE);
        ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _XXXX(vFog), ps.rCurSrc, _CONST(21));
        ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
    } else {
        ureg_MOV(ureg, oCol, ps.rCurSrc);
    }

    ureg_END(ureg);
    nine_ureg_tgsi_dump(ureg, FALSE);
    return ureg_create_shader_and_destroy(ureg, device->pipe);
}

static struct NineVertexShader9 *
nine_ff_get_vs(struct NineDevice9 *device)
{
    const struct nine_state *state = &device->state;
    struct NineVertexShader9 *vs;
    enum pipe_error err;
    struct vs_build_ctx bld;
    struct nine_ff_vs_key key;
    unsigned s, i;

    assert(sizeof(key) <= sizeof(key.value32));

    memset(&key, 0, sizeof(key));
    memset(&bld, 0, sizeof(bld));

    bld.key = &key;

    /* FIXME: this shouldn't be NULL, but it is on init */
    if (state->vdecl) {
        key.color0in_one = 1;
        key.color1in_one = 1;
        for (i = 0; i < state->vdecl->nelems; i++) {
            uint16_t usage = state->vdecl->usage_map[i];
            if (usage == NINE_DECLUSAGE_POSITIONT)
                key.position_t = 1;
            else if (usage == NINE_DECLUSAGE_i(COLOR, 0))
                key.color0in_one = 0;
            else if (usage == NINE_DECLUSAGE_i(COLOR, 1))
                key.color1in_one = 0;
            else if (usage == NINE_DECLUSAGE_PSIZE)
                key.vertexpointsize = 1;
        }
    }
    if (!key.vertexpointsize)
        key.pointscale = !!state->rs[D3DRS_POINTSCALEENABLE];

    key.lighting = !!state->rs[D3DRS_LIGHTING] &&  state->ff.num_lights_active;
    key.darkness = !!state->rs[D3DRS_LIGHTING] && !state->ff.num_lights_active;
    if (key.position_t) {
        key.darkness = 0; /* |= key.lighting; */ /* XXX ? */
        key.lighting = 0;
    }
    if ((key.lighting | key.darkness) && state->rs[D3DRS_COLORVERTEX]) {
        key.mtl_diffuse = state->rs[D3DRS_DIFFUSEMATERIALSOURCE];
        key.mtl_ambient = state->rs[D3DRS_AMBIENTMATERIALSOURCE];
        key.mtl_specular = state->rs[D3DRS_SPECULARMATERIALSOURCE];
        key.mtl_emissive = state->rs[D3DRS_EMISSIVEMATERIALSOURCE];
    }
    key.fog_mode = state->rs[D3DRS_FOGENABLE] ? state->rs[D3DRS_FOGVERTEXMODE] : 0;
    if (key.fog_mode)
        key.fog_range = !key.position_t && state->rs[D3DRS_RANGEFOGENABLE];

    if (state->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
        key.vertexblend_indexed = !!state->rs[D3DRS_INDEXEDVERTEXBLENDENABLE];

        switch (state->rs[D3DRS_VERTEXBLEND]) {
        case D3DVBF_0WEIGHTS: key.vertexblend = key.vertexblend_indexed; break;
        case D3DVBF_1WEIGHTS: key.vertexblend = 2; break;
        case D3DVBF_2WEIGHTS: key.vertexblend = 3; break;
        case D3DVBF_3WEIGHTS: key.vertexblend = 4; break;
        case D3DVBF_TWEENING: key.vertextween = 1; break;
        default:
            assert(!"invalid D3DVBF");
            break;
        }
    }

    for (s = 0; s < 8; ++s) {
        if (state->ff.tex_stage[s][D3DTSS_COLOROP] == D3DTOP_DISABLE &&
            state->ff.tex_stage[s][D3DTSS_ALPHAOP] == D3DTOP_DISABLE)
            break;
        key.tc_idx |= (state->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] & 7) << (s * 3);
        if (!key.position_t) {
            unsigned gen = (state->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] >> 16) + 1;
            unsigned dim = MIN2(state->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & 0x7, 4);
            key.tc_gen |= gen << (s * 3);
            key.tc_dim |= dim << (s * 3);
        } else {
            key.tc_gen |= NINED3DTSS_TCI_PASSTHRU << (s * 3);
        }
    }

    vs = util_hash_table_get(device->ff.ht_vs, &key);
    if (vs)
        return vs;
    NineVertexShader9_new(device, &vs, NULL, nine_ff_build_vs(device, &bld));

    nine_ff_prune_vs(device);
    if (vs) {
        unsigned n;

        memcpy(&vs->ff_key, &key, sizeof(vs->ff_key));

        err = util_hash_table_set(device->ff.ht_vs, &vs->ff_key, vs);
        assert(err == PIPE_OK);
        device->ff.num_vs++;
        NineUnknown_ConvertRefToBind(NineUnknown(vs));

        vs->num_inputs = bld.num_inputs;
        for (n = 0; n < bld.num_inputs; ++n)
            vs->input_map[n].ndecl = bld.input[n];

        vs->position_t = key.position_t;
        vs->point_size = key.vertexpointsize | key.pointscale;
    }
    return vs;
}

static struct NinePixelShader9 *
nine_ff_get_ps(struct NineDevice9 *device)
{
    struct nine_state *state = &device->state;
    struct NinePixelShader9 *ps;
    enum pipe_error err;
    struct nine_ff_ps_key key;
    unsigned s;

    assert(sizeof(key) <= sizeof(key.value32));

    memset(&key, 0, sizeof(key));
    for (s = 0; s < 8; ++s) {
        key.ts[s].colorop = state->ff.tex_stage[s][D3DTSS_COLOROP];
        key.ts[s].alphaop = state->ff.tex_stage[s][D3DTSS_ALPHAOP];
        /* MSDN says D3DTOP_DISABLE disables this and all subsequent stages. */
        /* ALPHAOP cannot be disabled if COLOROP is enabled. */
        if (key.ts[s].colorop == D3DTOP_DISABLE) {
            key.ts[s].alphaop = D3DTOP_DISABLE; /* DISABLE == 1, avoid degenerate keys */
            break;
        }
        if (!state->texture[s] &&
            state->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE) {
            /* This should also disable the stage. */
            key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
            break;
        }
        if (key.ts[s].colorop != D3DTOP_DISABLE) {
            uint8_t used_c = ps_d3dtop_args_mask(key.ts[s].colorop);
            if (used_c & 0x1) key.ts[s].colorarg0 = state->ff.tex_stage[s][D3DTSS_COLORARG0];
            if (used_c & 0x2) key.ts[s].colorarg1 = state->ff.tex_stage[s][D3DTSS_COLORARG1];
            if (used_c & 0x4) key.ts[s].colorarg2 = state->ff.tex_stage[s][D3DTSS_COLORARG2];
            if (used_c & 0x1) key.colorarg_b4[0] |= (state->ff.tex_stage[s][D3DTSS_COLORARG0] >> 4) << s;
            if (used_c & 0x1) key.colorarg_b5[0] |= (state->ff.tex_stage[s][D3DTSS_COLORARG0] >> 5) << s;
            if (used_c & 0x2) key.colorarg_b4[1] |= (state->ff.tex_stage[s][D3DTSS_COLORARG1] >> 4) << s;
            if (used_c & 0x2) key.colorarg_b5[1] |= (state->ff.tex_stage[s][D3DTSS_COLORARG1] >> 5) << s;
            if (used_c & 0x4) key.colorarg_b4[2] |= (state->ff.tex_stage[s][D3DTSS_COLORARG2] >> 4) << s;
            if (used_c & 0x4) key.colorarg_b5[2] |= (state->ff.tex_stage[s][D3DTSS_COLORARG2] >> 5) << s;
        }
        if (key.ts[s].alphaop != D3DTOP_DISABLE) {
            uint8_t used_a = ps_d3dtop_args_mask(key.ts[s].alphaop);
            if (used_a & 0x1) key.ts[s].alphaarg0 = state->ff.tex_stage[s][D3DTSS_ALPHAARG0];
            if (used_a & 0x2) key.ts[s].alphaarg1 = state->ff.tex_stage[s][D3DTSS_ALPHAARG1];
            if (used_a & 0x4) key.ts[s].alphaarg2 = state->ff.tex_stage[s][D3DTSS_ALPHAARG2];
            if (used_a & 0x1) key.alphaarg_b4[0] |= (state->ff.tex_stage[s][D3DTSS_ALPHAARG0] >> 4) << s;
            if (used_a & 0x2) key.alphaarg_b4[1] |= (state->ff.tex_stage[s][D3DTSS_ALPHAARG1] >> 4) << s;
            if (used_a & 0x4) key.alphaarg_b4[2] |= (state->ff.tex_stage[s][D3DTSS_ALPHAARG2] >> 4) << s;
        }
        key.ts[s].resultarg = state->ff.tex_stage[s][D3DTSS_RESULTARG] == D3DTA_TEMP;

        key.ts[s].projected = !!(state->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & D3DTTFF_PROJECTED);

        if (state->texture[s]) {
            switch (state->texture[s]->base.type) {
            case D3DRTYPE_TEXTURE:       key.ts[s].textarget = 1; break;
            case D3DRTYPE_VOLUMETEXTURE: key.ts[s].textarget = 2; break;
            case D3DRTYPE_CUBETEXTURE:   key.ts[s].textarget = 3; break;
            default:
                assert(!"unexpected texture type");
                break;
            }
        } else {
            key.ts[s].textarget = 1;
        }
    }
    for (; s < 8; ++s)
        key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
    if (state->rs[D3DRS_FOGENABLE])
        key.fog_mode = state->rs[D3DRS_FOGTABLEMODE];

    ps = util_hash_table_get(device->ff.ht_ps, &key);
    if (ps)
        return ps;
    NinePixelShader9_new(device, &ps, NULL, nine_ff_build_ps(device, &key));

    nine_ff_prune_ps(device);
    if (ps) {
        memcpy(&ps->ff_key, &key, sizeof(ps->ff_key));

        err = util_hash_table_set(device->ff.ht_ps, &ps->ff_key, ps);
        assert(err == PIPE_OK);
        device->ff.num_ps++;
        NineUnknown_ConvertRefToBind(NineUnknown(ps));

        ps->rt_mask = 0x1;
    }
    return ps;
}

#define GET_D3DTS(n) nine_state_access_transform(state, D3DTS_##n, FALSE)
#define IS_D3DTS_DIRTY(s,n) ((s)->ff.changed.transform[(D3DTS_##n) / 32] & (1 << ((D3DTS_##n) % 32)))
static void
nine_ff_load_vs_transforms(struct NineDevice9 *device)
{
    struct nine_state *state = &device->state;
    D3DMATRIX T;
    D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
    unsigned i;

    /* TODO: make this nicer, and only upload the ones we need */
    /* TODO: use ff.vs_const as storage of W, V, P matrices */

    if (IS_D3DTS_DIRTY(state, WORLD) ||
        IS_D3DTS_DIRTY(state, VIEW) ||
        IS_D3DTS_DIRTY(state, PROJECTION)) {
        /* WVP, WV matrices */
        nine_d3d_matrix_matrix_mul(&M[1], GET_D3DTS(WORLD), GET_D3DTS(VIEW));
        nine_d3d_matrix_matrix_mul(&M[0], &M[1], GET_D3DTS(PROJECTION));

        /* normal matrix == transpose(inverse(WV)) */
        nine_d3d_matrix_inverse_3x3(&T, &M[1]);
        nine_d3d_matrix_transpose(&M[4], &T);

        /* VP matrix */
        nine_d3d_matrix_matrix_mul(&M[2], GET_D3DTS(VIEW), GET_D3DTS(PROJECTION));

        /* V and W matrix */
        M[3] = *GET_D3DTS(VIEW);
        M[56] = *GET_D3DTS(WORLD);
    }

    if (state->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
        /* load other world matrices */
        for (i = 1; i <= 7; ++i)
            M[56 + i] = *GET_D3DTS(WORLDMATRIX(i));
    }

    device->ff.vs_const[30 * 4] = asfloat(state->rs[D3DRS_TWEENFACTOR]);
}

static void
nine_ff_load_lights(struct NineDevice9 *device)
{
    struct nine_state *state = &device->state;
    struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
    unsigned l;

    if (state->changed.group & NINE_STATE_FF_MATERIAL) {
        const D3DMATERIAL9 *mtl = &state->ff.material;

        memcpy(&dst[20], &mtl->Diffuse, 4 * sizeof(float));
        memcpy(&dst[21], &mtl->Ambient, 4 * sizeof(float));
        memcpy(&dst[22], &mtl->Specular, 4 * sizeof(float));
        dst[23].x = mtl->Power;
        memcpy(&dst[24], &mtl->Emissive, 4 * sizeof(float));
        d3dcolor_to_rgba(&dst[25].x, state->rs[D3DRS_AMBIENT]);
        dst[19].x = dst[25].x * mtl->Ambient.r + mtl->Emissive.r;
        dst[19].y = dst[25].y * mtl->Ambient.g + mtl->Emissive.g;
        dst[19].z = dst[25].z * mtl->Ambient.b + mtl->Emissive.b;
        dst[19].w = mtl->Ambient.a + mtl->Emissive.a;
    }

    if (!(state->changed.group & NINE_STATE_FF_LIGHTING))
        return;

    for (l = 0; l < state->ff.num_lights_active; ++l) {
        const D3DLIGHT9 *light = &state->ff.light[state->ff.active_light[l]];

        dst[32 + l * 8].x = light->Type;
        dst[32 + l * 8].y = light->Attenuation0;
        dst[32 + l * 8].z = light->Attenuation1;
        dst[32 + l * 8].w = light->Attenuation2;
        memcpy(&dst[33 + l * 8].x, &light->Diffuse, sizeof(light->Diffuse));
        memcpy(&dst[34 + l * 8].x, &light->Specular, sizeof(light->Specular));
        memcpy(&dst[35 + l * 8].x, &light->Ambient, sizeof(light->Ambient));
        nine_d3d_vector4_matrix_mul((D3DVECTOR *)&dst[36 + l * 8].x, &light->Position, GET_D3DTS(VIEW));
        nine_d3d_vector3_matrix_mul((D3DVECTOR *)&dst[37 + l * 8].x, &light->Direction, GET_D3DTS(VIEW));
        dst[36 + l * 8].w = light->Type == D3DLIGHT_DIRECTIONAL ? 1e9f : light->Range;
        dst[37 + l * 8].w = light->Falloff;
        dst[38 + l * 8].x = cosf(light->Theta * 0.5f);
        dst[38 + l * 8].y = cosf(light->Phi * 0.5f);
        dst[38 + l * 8].z = 1.0f / (dst[38 + l * 8].x - dst[38 + l * 8].y);
        dst[39 + l * 8].w = (l + 1) == state->ff.num_lights_active;
    }
}

static void
nine_ff_load_point_and_fog_params(struct NineDevice9 *device)
{
    const struct nine_state *state = &device->state;
    struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;

    if (!(state->changed.group & NINE_STATE_FF_OTHER))
        return;
    dst[26].x = asfloat(state->rs[D3DRS_POINTSIZE_MIN]);
    dst[26].y = asfloat(state->rs[D3DRS_POINTSIZE_MAX]);
    dst[26].z = asfloat(state->rs[D3DRS_POINTSIZE]);
    dst[26].w = asfloat(state->rs[D3DRS_POINTSCALE_A]);
    dst[27].x = asfloat(state->rs[D3DRS_POINTSCALE_B]);
    dst[27].y = asfloat(state->rs[D3DRS_POINTSCALE_C]);
    dst[28].x = asfloat(state->rs[D3DRS_FOGEND]);
    dst[28].y = 1.0f / (asfloat(state->rs[D3DRS_FOGEND]) - asfloat(state->rs[D3DRS_FOGSTART]));
    if (isinf(dst[28].y))
        dst[28].y = 0.0f;
    dst[28].z = asfloat(state->rs[D3DRS_FOGDENSITY]);
    d3dcolor_to_rgba(&dst[29].x, state->rs[D3DRS_FOGCOLOR]);
}

static void
nine_ff_load_tex_matrices(struct NineDevice9 *device)
{
    struct nine_state *state = &device->state;
    D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
    unsigned s;

    if (!(state->ff.changed.transform[0] & 0xff0000))
        return;
    for (s = 0; s < 8; ++s) {
        if (IS_D3DTS_DIRTY(state, TEXTURE0 + s))
            M[32 + s] = *nine_state_access_transform(state, D3DTS_TEXTURE0 + s, FALSE);
    }
}

static void
nine_ff_load_ps_params(struct NineDevice9 *device)
{
    const struct nine_state *state = &device->state;
    struct fvec4 *dst = (struct fvec4 *)device->ff.ps_const;
    unsigned s;

    if (!(state->changed.group & (NINE_STATE_FF_PSSTAGES | NINE_STATE_FF_OTHER)))
        return;

    for (s = 0; s < 8; ++s)
        d3dcolor_to_rgba(&dst[s].x, state->ff.tex_stage[s][D3DTSS_CONSTANT]);

    for (s = 0; s < 8; ++s) {
        dst[8 + s].x = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVMAT00]);
        dst[8 + s].y = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVMAT01]);
        dst[8 + s].z = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVMAT10]);
        dst[8 + s].w = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVMAT11]);
        if (s & 1) {
            dst[8 + s / 2].z = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
            dst[8 + s / 2].w = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
        } else {
            dst[8 + s / 2].x = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
            dst[8 + s / 2].y = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
        }
    }

    d3dcolor_to_rgba(&dst[20].x, state->rs[D3DRS_TEXTUREFACTOR]);
    d3dcolor_to_rgba(&dst[21].x, state->rs[D3DRS_FOGCOLOR]);
    dst[22].x = asfloat(state->rs[D3DRS_FOGEND]);
    dst[22].y = 1.0f / (asfloat(state->rs[D3DRS_FOGEND]) - asfloat(state->rs[D3DRS_FOGSTART]));
    dst[22].z = asfloat(state->rs[D3DRS_FOGDENSITY]);
}

static void
nine_ff_load_viewport_info(struct NineDevice9 *device)
{
    D3DVIEWPORT9 *viewport = &device->state.viewport;
    struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
    float diffZ = viewport->MaxZ - viewport->MinZ;

    /* Note: the other functions avoids to fill the const again if nothing changed.
     * But we don't have much to fill, and adding code to allow that may be complex
     * so just fill it always */
    dst[100].x = 2.0f / (float)(viewport->Width);
    dst[100].y = 2.0f / (float)(viewport->Height);
    dst[100].z = (diffZ == 0.0f) ? 0.0f : (1.0f / diffZ);
    dst[101].x = (float)(viewport->X);
    dst[101].y = (float)(viewport->Y);
    dst[101].z = (float)(viewport->MinZ);
}

void
nine_ff_update(struct NineDevice9 *device)
{
    struct pipe_context *pipe = device->pipe;
    struct nine_state *state = &device->state;

    DBG("vs=%p ps=%p\n", device->state.vs, device->state.ps);

    /* NOTE: the only reference belongs to the hash table */
    if (!device->state.vs)
        device->ff.vs = nine_ff_get_vs(device);
    if (!device->state.ps)
        device->ff.ps = nine_ff_get_ps(device);

    if (!device->state.vs) {
        if (device->state.ff.clobber.vs_const) {
            device->state.ff.clobber.vs_const = FALSE;
            device->state.changed.group |=
                NINE_STATE_FF_VSTRANSF |
                NINE_STATE_FF_MATERIAL |
                NINE_STATE_FF_LIGHTING |
                NINE_STATE_FF_OTHER;
            device->state.ff.changed.transform[0] |= 0xff000c;
            device->state.ff.changed.transform[8] |= 0xff;
        }
        nine_ff_load_vs_transforms(device);
        nine_ff_load_tex_matrices(device);
        nine_ff_load_lights(device);
        nine_ff_load_point_and_fog_params(device);
        nine_ff_load_viewport_info(device);

        memset(state->ff.changed.transform, 0, sizeof(state->ff.changed.transform));

        device->state.changed.group |= NINE_STATE_VS;
        device->state.changed.group |= NINE_STATE_VS_CONST;

        if (device->prefer_user_constbuf) {
            struct pipe_context *pipe = device->pipe;
            struct pipe_constant_buffer cb;
            cb.buffer_offset = 0;
            cb.buffer = NULL;
            cb.user_buffer = device->ff.vs_const;
            cb.buffer_size = NINE_FF_NUM_VS_CONST * 4 * sizeof(float);
            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &cb);
        } else {
            struct pipe_box box;
            u_box_1d(0, NINE_FF_NUM_VS_CONST * 4 * sizeof(float), &box);
            pipe->transfer_inline_write(pipe, device->constbuf_vs, 0,
                                        0, &box,
                                        device->ff.vs_const, 0, 0);
            nine_ranges_insert(&device->state.changed.vs_const_f, 0, NINE_FF_NUM_VS_CONST,
                               &device->range_pool);
        }
    }

    if (!device->state.ps) {
        if (device->state.ff.clobber.ps_const) {
            device->state.ff.clobber.ps_const = FALSE;
            device->state.changed.group |=
                NINE_STATE_FF_PSSTAGES |
                NINE_STATE_FF_OTHER;
        }
        nine_ff_load_ps_params(device);

        device->state.changed.group |= NINE_STATE_PS;
        device->state.changed.group |= NINE_STATE_PS_CONST;

        if (device->prefer_user_constbuf) {
            struct pipe_context *pipe = device->pipe;
            struct pipe_constant_buffer cb;
            cb.buffer_offset = 0;
            cb.buffer = NULL;
            cb.user_buffer = device->ff.ps_const;
            cb.buffer_size = NINE_FF_NUM_PS_CONST * 4 * sizeof(float);
            pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &cb);
        } else {
            struct pipe_box box;
            u_box_1d(0, NINE_FF_NUM_PS_CONST * 4 * sizeof(float), &box);
            pipe->transfer_inline_write(pipe, device->constbuf_ps, 0,
                                        0, &box,
                                        device->ff.ps_const, 0, 0);
            nine_ranges_insert(&device->state.changed.ps_const_f, 0, NINE_FF_NUM_PS_CONST,
                               &device->range_pool);
        }
    }

    device->state.changed.group &= ~NINE_STATE_FF;
}


boolean
nine_ff_init(struct NineDevice9 *device)
{
    device->ff.ht_vs = util_hash_table_create(nine_ff_vs_key_hash,
                                              nine_ff_vs_key_comp);
    device->ff.ht_ps = util_hash_table_create(nine_ff_ps_key_hash,
                                              nine_ff_ps_key_comp);

    device->ff.ht_fvf = util_hash_table_create(nine_ff_fvf_key_hash,
                                               nine_ff_fvf_key_comp);

    device->ff.vs_const = CALLOC(NINE_FF_NUM_VS_CONST, 4 * sizeof(float));
    device->ff.ps_const = CALLOC(NINE_FF_NUM_PS_CONST, 4 * sizeof(float));

    return device->ff.ht_vs && device->ff.ht_ps &&
        device->ff.ht_fvf &&
        device->ff.vs_const && device->ff.ps_const;
}

static enum pipe_error nine_ff_ht_delete_cb(void *key, void *value, void *data)
{
    NineUnknown_Unbind(NineUnknown(value));
    return PIPE_OK;
}

void
nine_ff_fini(struct NineDevice9 *device)
{
    if (device->ff.ht_vs) {
        util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
        util_hash_table_destroy(device->ff.ht_vs);
    }
    if (device->ff.ht_ps) {
        util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
        util_hash_table_destroy(device->ff.ht_ps);
    }
    if (device->ff.ht_fvf) {
        util_hash_table_foreach(device->ff.ht_fvf, nine_ff_ht_delete_cb, NULL);
        util_hash_table_destroy(device->ff.ht_fvf);
    }
    device->ff.vs = NULL; /* destroyed by unbinding from hash table */
    device->ff.ps = NULL;

    FREE(device->ff.vs_const);
    FREE(device->ff.ps_const);
}

static void
nine_ff_prune_vs(struct NineDevice9 *device)
{
    if (device->ff.num_vs > 100) {
        /* could destroy the bound one here, so unbind */
        device->pipe->bind_vs_state(device->pipe, NULL);
        util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
        util_hash_table_clear(device->ff.ht_vs);
        device->ff.num_vs = 0;
        device->state.changed.group |= NINE_STATE_VS;
    }
}
static void
nine_ff_prune_ps(struct NineDevice9 *device)
{
    if (device->ff.num_ps > 100) {
        /* could destroy the bound one here, so unbind */
        device->pipe->bind_fs_state(device->pipe, NULL);
        util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
        util_hash_table_clear(device->ff.ht_ps);
        device->ff.num_ps = 0;
        device->state.changed.group |= NINE_STATE_PS;
    }
}

/* ========================================================================== */

/* Matrix multiplication:
 *
 * in memory: 0 1 2 3 (row major)
 *            4 5 6 7
 *            8 9 a b
 *            c d e f
 *
 *    cA cB cC cD
 * r0             = (r0 * cA) (r0 * cB) . .
 * r1             = (r1 * cA) (r1 * cB)
 * r2             = (r2 * cA) .
 * r3             = (r3 * cA) .
 *
 *               r: (11) (12) (13) (14)
 *                  (21) (22) (23) (24)
 *                  (31) (32) (33) (34)
 *                  (41) (42) (43) (44)
 * l: (11 12 13 14)
 *    (21 22 23 24)
 *    (31 32 33 34)
 *    (41 42 43 44)
 *
 * v: (x  y  z  1 )
 *
 * t.xyzw = MUL(v.xxxx, r[0]);
 * t.xyzw = MAD(v.yyyy, r[1], t.xyzw);
 * t.xyzw = MAD(v.zzzz, r[2], t.xyzw);
 * v.xyzw = MAD(v.wwww, r[3], t.xyzw);
 *
 * v.x = DP4(v, c[0]);
 * v.y = DP4(v, c[1]);
 * v.z = DP4(v, c[2]);
 * v.w = DP4(v, c[3]) = 1
 */

/*
static void
nine_D3DMATRIX_print(const D3DMATRIX *M)
{
    DBG("\n(%f %f %f %f)\n"
        "(%f %f %f %f)\n"
        "(%f %f %f %f)\n"
        "(%f %f %f %f)\n",
        M->m[0][0], M->m[0][1], M->m[0][2], M->m[0][3],
        M->m[1][0], M->m[1][1], M->m[1][2], M->m[1][3],
        M->m[2][0], M->m[2][1], M->m[2][2], M->m[2][3],
        M->m[3][0], M->m[3][1], M->m[3][2], M->m[3][3]);
}
*/

static INLINE float
nine_DP4_row_col(const D3DMATRIX *A, int r, const D3DMATRIX *B, int c)
{
    return A->m[r][0] * B->m[0][c] +
           A->m[r][1] * B->m[1][c] +
           A->m[r][2] * B->m[2][c] +
           A->m[r][3] * B->m[3][c];
}

static INLINE float
nine_DP4_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
{
    return v->x * M->m[0][c] +
           v->y * M->m[1][c] +
           v->z * M->m[2][c] +
           1.0f * M->m[3][c];
}

static INLINE float
nine_DP3_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
{
    return v->x * M->m[0][c] +
           v->y * M->m[1][c] +
           v->z * M->m[2][c];
}

void
nine_d3d_matrix_matrix_mul(D3DMATRIX *D, const D3DMATRIX *L, const D3DMATRIX *R)
{
    D->_11 = nine_DP4_row_col(L, 0, R, 0);
    D->_12 = nine_DP4_row_col(L, 0, R, 1);
    D->_13 = nine_DP4_row_col(L, 0, R, 2);
    D->_14 = nine_DP4_row_col(L, 0, R, 3);

    D->_21 = nine_DP4_row_col(L, 1, R, 0);
    D->_22 = nine_DP4_row_col(L, 1, R, 1);
    D->_23 = nine_DP4_row_col(L, 1, R, 2);
    D->_24 = nine_DP4_row_col(L, 1, R, 3);

    D->_31 = nine_DP4_row_col(L, 2, R, 0);
    D->_32 = nine_DP4_row_col(L, 2, R, 1);
    D->_33 = nine_DP4_row_col(L, 2, R, 2);
    D->_34 = nine_DP4_row_col(L, 2, R, 3);

    D->_41 = nine_DP4_row_col(L, 3, R, 0);
    D->_42 = nine_DP4_row_col(L, 3, R, 1);
    D->_43 = nine_DP4_row_col(L, 3, R, 2);
    D->_44 = nine_DP4_row_col(L, 3, R, 3);
}

void
nine_d3d_vector4_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
{
    d->x = nine_DP4_vec_col(v, M, 0);
    d->y = nine_DP4_vec_col(v, M, 1);
    d->z = nine_DP4_vec_col(v, M, 2);
}

void
nine_d3d_vector3_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
{
    d->x = nine_DP3_vec_col(v, M, 0);
    d->y = nine_DP3_vec_col(v, M, 1);
    d->z = nine_DP3_vec_col(v, M, 2);
}

void
nine_d3d_matrix_transpose(D3DMATRIX *D, const D3DMATRIX *M)
{
    unsigned i, j;
    for (i = 0; i < 4; ++i)
    for (j = 0; j < 4; ++j)
        D->m[i][j] = M->m[j][i];
}

#define _M_ADD_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
    float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
    if (t > 0.0f) pos += t; else neg += t; } while(0)

#define _M_SUB_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
    float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
    if (t > 0.0f) neg -= t; else pos -= t; } while(0)
float
nine_d3d_matrix_det(const D3DMATRIX *M)
{
    float pos = 0.0f;
    float neg = 0.0f;

    _M_ADD_PROD_1i_2j_3k_4l(1, 2, 3, 4);
    _M_ADD_PROD_1i_2j_3k_4l(1, 3, 4, 2);
    _M_ADD_PROD_1i_2j_3k_4l(1, 4, 2, 3);

    _M_ADD_PROD_1i_2j_3k_4l(2, 1, 4, 3);
    _M_ADD_PROD_1i_2j_3k_4l(2, 3, 1, 4);
    _M_ADD_PROD_1i_2j_3k_4l(2, 4, 3, 1);

    _M_ADD_PROD_1i_2j_3k_4l(3, 1, 2, 4);
    _M_ADD_PROD_1i_2j_3k_4l(3, 2, 4, 1);
    _M_ADD_PROD_1i_2j_3k_4l(3, 4, 1, 2);

    _M_ADD_PROD_1i_2j_3k_4l(4, 1, 3, 2);
    _M_ADD_PROD_1i_2j_3k_4l(4, 2, 1, 3);
    _M_ADD_PROD_1i_2j_3k_4l(4, 3, 2, 1);

    _M_SUB_PROD_1i_2j_3k_4l(1, 2, 4, 3);
    _M_SUB_PROD_1i_2j_3k_4l(1, 3, 2, 4);
    _M_SUB_PROD_1i_2j_3k_4l(1, 4, 3, 2);

    _M_SUB_PROD_1i_2j_3k_4l(2, 1, 3, 4);
    _M_SUB_PROD_1i_2j_3k_4l(2, 3, 4, 1);
    _M_SUB_PROD_1i_2j_3k_4l(2, 4, 1, 3);

    _M_SUB_PROD_1i_2j_3k_4l(3, 1, 4, 2);
    _M_SUB_PROD_1i_2j_3k_4l(3, 2, 1, 4);
    _M_SUB_PROD_1i_2j_3k_4l(3, 4, 2, 1);

    _M_SUB_PROD_1i_2j_3k_4l(4, 1, 2, 3);
    _M_SUB_PROD_1i_2j_3k_4l(4, 2, 3, 1);
    _M_SUB_PROD_1i_2j_3k_4l(4, 3, 1, 2);

    return pos + neg;
}

/* XXX: Probably better to just use src/mesa/math/m_matrix.c because
 * I have no idea where this code came from.
 */
void
nine_d3d_matrix_inverse(D3DMATRIX *D, const D3DMATRIX *M)
{
    int i, k;
    float det;

    D->m[0][0] =
        M->m[1][1] * M->m[2][2] * M->m[3][3] -
        M->m[1][1] * M->m[3][2] * M->m[2][3] -
        M->m[1][2] * M->m[2][1] * M->m[3][3] +
        M->m[1][2] * M->m[3][1] * M->m[2][3] +
        M->m[1][3] * M->m[2][1] * M->m[3][2] -
        M->m[1][3] * M->m[3][1] * M->m[2][2];

    D->m[0][1] =
       -M->m[0][1] * M->m[2][2] * M->m[3][3] +
        M->m[0][1] * M->m[3][2] * M->m[2][3] +
        M->m[0][2] * M->m[2][1] * M->m[3][3] -
        M->m[0][2] * M->m[3][1] * M->m[2][3] -
        M->m[0][3] * M->m[2][1] * M->m[3][2] +
        M->m[0][3] * M->m[3][1] * M->m[2][2];

    D->m[0][2] =
        M->m[0][1] * M->m[1][2] * M->m[3][3] -
        M->m[0][1] * M->m[3][2] * M->m[1][3] -
        M->m[0][2] * M->m[1][1] * M->m[3][3] +
        M->m[0][2] * M->m[3][1] * M->m[1][3] +
        M->m[0][3] * M->m[1][1] * M->m[3][2] -
        M->m[0][3] * M->m[3][1] * M->m[1][2];

    D->m[0][3] =
       -M->m[0][1] * M->m[1][2] * M->m[2][3] +
        M->m[0][1] * M->m[2][2] * M->m[1][3] +
        M->m[0][2] * M->m[1][1] * M->m[2][3] -
        M->m[0][2] * M->m[2][1] * M->m[1][3] -
        M->m[0][3] * M->m[1][1] * M->m[2][2] +
        M->m[0][3] * M->m[2][1] * M->m[1][2];

    D->m[1][0] =
       -M->m[1][0] * M->m[2][2] * M->m[3][3] +
        M->m[1][0] * M->m[3][2] * M->m[2][3] +
        M->m[1][2] * M->m[2][0] * M->m[3][3] -
        M->m[1][2] * M->m[3][0] * M->m[2][3] -
        M->m[1][3] * M->m[2][0] * M->m[3][2] +
        M->m[1][3] * M->m[3][0] * M->m[2][2];

    D->m[1][1] =
        M->m[0][0] * M->m[2][2] * M->m[3][3] -
        M->m[0][0] * M->m[3][2] * M->m[2][3] -
        M->m[0][2] * M->m[2][0] * M->m[3][3] +
        M->m[0][2] * M->m[3][0] * M->m[2][3] +
        M->m[0][3] * M->m[2][0] * M->m[3][2] -
        M->m[0][3] * M->m[3][0] * M->m[2][2];

    D->m[1][2] =
       -M->m[0][0] * M->m[1][2] * M->m[3][3] +
        M->m[0][0] * M->m[3][2] * M->m[1][3] +
        M->m[0][2] * M->m[1][0] * M->m[3][3] -
        M->m[0][2] * M->m[3][0] * M->m[1][3] -
        M->m[0][3] * M->m[1][0] * M->m[3][2] +
        M->m[0][3] * M->m[3][0] * M->m[1][2];

    D->m[1][3] =
        M->m[0][0] * M->m[1][2] * M->m[2][3] -
        M->m[0][0] * M->m[2][2] * M->m[1][3] -
        M->m[0][2] * M->m[1][0] * M->m[2][3] +
        M->m[0][2] * M->m[2][0] * M->m[1][3] +
        M->m[0][3] * M->m[1][0] * M->m[2][2] -
        M->m[0][3] * M->m[2][0] * M->m[1][2];

    D->m[2][0] =
        M->m[1][0] * M->m[2][1] * M->m[3][3] -
        M->m[1][0] * M->m[3][1] * M->m[2][3] -
        M->m[1][1] * M->m[2][0] * M->m[3][3] +
        M->m[1][1] * M->m[3][0] * M->m[2][3] +
        M->m[1][3] * M->m[2][0] * M->m[3][1] -
        M->m[1][3] * M->m[3][0] * M->m[2][1];

    D->m[2][1] =
       -M->m[0][0] * M->m[2][1] * M->m[3][3] +
        M->m[0][0] * M->m[3][1] * M->m[2][3] +
        M->m[0][1] * M->m[2][0] * M->m[3][3] -
        M->m[0][1] * M->m[3][0] * M->m[2][3] -
        M->m[0][3] * M->m[2][0] * M->m[3][1] +
        M->m[0][3] * M->m[3][0] * M->m[2][1];

    D->m[2][2] =
        M->m[0][0] * M->m[1][1] * M->m[3][3] -
        M->m[0][0] * M->m[3][1] * M->m[1][3] -
        M->m[0][1] * M->m[1][0] * M->m[3][3] +
        M->m[0][1] * M->m[3][0] * M->m[1][3] +
        M->m[0][3] * M->m[1][0] * M->m[3][1] -
        M->m[0][3] * M->m[3][0] * M->m[1][1];

    D->m[2][3] =
       -M->m[0][0] * M->m[1][1] * M->m[2][3] +
        M->m[0][0] * M->m[2][1] * M->m[1][3] +
        M->m[0][1] * M->m[1][0] * M->m[2][3] -
        M->m[0][1] * M->m[2][0] * M->m[1][3] -
        M->m[0][3] * M->m[1][0] * M->m[2][1] +
        M->m[0][3] * M->m[2][0] * M->m[1][1];

    D->m[3][0] =
       -M->m[1][0] * M->m[2][1] * M->m[3][2] +
        M->m[1][0] * M->m[3][1] * M->m[2][2] +
        M->m[1][1] * M->m[2][0] * M->m[3][2] -
        M->m[1][1] * M->m[3][0] * M->m[2][2] -
        M->m[1][2] * M->m[2][0] * M->m[3][1] +
        M->m[1][2] * M->m[3][0] * M->m[2][1];

    D->m[3][1] =
        M->m[0][0] * M->m[2][1] * M->m[3][2] -
        M->m[0][0] * M->m[3][1] * M->m[2][2] -
        M->m[0][1] * M->m[2][0] * M->m[3][2] +
        M->m[0][1] * M->m[3][0] * M->m[2][2] +
        M->m[0][2] * M->m[2][0] * M->m[3][1] -
        M->m[0][2] * M->m[3][0] * M->m[2][1];

    D->m[3][2] =
       -M->m[0][0] * M->m[1][1] * M->m[3][2] +
        M->m[0][0] * M->m[3][1] * M->m[1][2] +
        M->m[0][1] * M->m[1][0] * M->m[3][2] -
        M->m[0][1] * M->m[3][0] * M->m[1][2] -
        M->m[0][2] * M->m[1][0] * M->m[3][1] +
        M->m[0][2] * M->m[3][0] * M->m[1][1];

    D->m[3][3] =
        M->m[0][0] * M->m[1][1] * M->m[2][2] -
        M->m[0][0] * M->m[2][1] * M->m[1][2] -
        M->m[0][1] * M->m[1][0] * M->m[2][2] +
        M->m[0][1] * M->m[2][0] * M->m[1][2] +
        M->m[0][2] * M->m[1][0] * M->m[2][1] -
        M->m[0][2] * M->m[2][0] * M->m[1][1];

    det =
        M->m[0][0] * D->m[0][0] +
        M->m[1][0] * D->m[0][1] +
        M->m[2][0] * D->m[0][2] +
        M->m[3][0] * D->m[0][3];

    det = 1.0 / det;

    for (i = 0; i < 4; i++)
    for (k = 0; k < 4; k++)
        D->m[i][k] *= det;

#ifdef DEBUG
    {
        D3DMATRIX I;

        nine_d3d_matrix_matrix_mul(&I, D, M);

        for (i = 0; i < 4; ++i)
        for (k = 0; k < 4; ++k)
            if (fabsf(I.m[i][k] - (float)(i == k)) > 1e-3)
                DBG("Matrix inversion check FAILED !\n");
    }
#endif
}

/* TODO: don't use 4x4 inverse, unless this gets all nicely inlined ? */
void
nine_d3d_matrix_inverse_3x3(D3DMATRIX *D, const D3DMATRIX *M)
{
    D3DMATRIX T;
    unsigned i, j;

    for (i = 0; i < 3; ++i)
    for (j = 0; j < 3; ++j)
        T.m[i][j] = M->m[i][j];
    for (i = 0; i < 3; ++i) {
        T.m[i][3] = 0.0f;
        T.m[3][i] = 0.0f;
    }
    T.m[3][3] = 1.0f;

    nine_d3d_matrix_inverse(D, &T);
}
