/*
 * Copyright © 2014-2015 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/nir/nir.h"
#include "compiler/nir/nir_deref.h"
#include "nir/nir_to_tgsi.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_from_mesa.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_ureg.h"
#include "util/debug.h"
#include "util/u_math.h"
#include "util/u_memory.h"

struct ntt_compile {
   nir_shader *s;
   nir_function_impl *impl;
   struct pipe_screen *screen;
   struct ureg_program *ureg;

   bool needs_texcoord_semantic;
   bool any_reg_as_address;
   bool native_integers;
   bool has_txf_lz;

   int next_addr_reg;
   bool addr_declared[2];
   struct ureg_dst addr_reg[2];

   /* if condition set up at the end of a block, for ntt_emit_if(). */
   struct ureg_src if_cond;

   /* TGSI temps for our NIR SSA and register values. */
   struct ureg_dst *reg_temp;
   struct ureg_src *ssa_temp;

   nir_instr_liveness *liveness;

   /* Mappings from driver_location to TGSI input/output number.
    *
    * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
    * their numbers assigned incrementally, unlike inputs or constants.
    */
   struct ureg_src *input_index_map;
   uint64_t centroid_inputs;

   uint32_t first_ubo;

   struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
};

static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);

/**
 * Interprets a nir_load_const used as a NIR src as a uint.
 *
 * For non-native-integers drivers, nir_load_const_instrs used by an integer ALU
 * instruction (or in a phi-web used by an integer ALU instruction) were
 * converted to floats and the ALU instruction swapped to the float equivalent.
 * However, this means that integer load_consts used by intrinsics (which don't
 * normally get that conversion) may have been reformatted to be floats.  Given
 * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
 * we can just look and see if they look like floats and convert them back to
 * ints.
 */
static uint32_t
ntt_src_as_uint(struct ntt_compile *c, nir_src src)
{
   uint32_t val = nir_src_as_uint(src);
   if (!c->native_integers && val >= fui(1.0))
      val = (uint32_t)uif(val);
   return val;
}

static unsigned
ntt_64bit_write_mask(unsigned write_mask)
{
   return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
}

static struct ureg_src
ntt_64bit_1f(struct ntt_compile *c)
{
   return ureg_imm4u(c->ureg,
                     0x00000000, 0x3ff00000,
                     0x00000000, 0x3ff00000);
}

static const struct glsl_type *
ntt_shader_input_type(struct ntt_compile *c,
                      struct nir_variable *var)
{
   switch (c->s->info.stage) {
   case MESA_SHADER_GEOMETRY:
   case MESA_SHADER_TESS_EVAL:
   case MESA_SHADER_TESS_CTRL:
      if (glsl_type_is_array(var->type))
         return glsl_get_array_element(var->type);
      else
         return var->type;
   default:
      return var->type;
   }
}

static void
ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
                            unsigned *semantic_name, unsigned *semantic_index)
{
   /* We want to use most of tgsi_get_gl_varying_semantic(), but the
    * !texcoord shifting has already been applied, so avoid that.
    */
   if (!c->needs_texcoord_semantic &&
       (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
      *semantic_name = TGSI_SEMANTIC_GENERIC;
      *semantic_index = location - VARYING_SLOT_VAR0;
      return;
   }

   tgsi_get_gl_varying_semantic(location, true,
                                semantic_name, semantic_index);
}

/* TGSI varying declarations have a component usage mask associated (used by
 * r600 and svga).
 */
static uint32_t
ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
                    bool is_64)
{
   uint32_t usage_mask =
      u_bit_consecutive(start_component, num_components);

   if (is_64) {
      if (start_component >= 2)
         usage_mask >>= 2;

      uint32_t tgsi_usage_mask = 0;

      if (usage_mask & TGSI_WRITEMASK_X)
         tgsi_usage_mask |= TGSI_WRITEMASK_XY;
      if (usage_mask & TGSI_WRITEMASK_Y)
         tgsi_usage_mask |= TGSI_WRITEMASK_ZW;

      return tgsi_usage_mask;
   } else {
      return usage_mask;
   }
}

/* TGSI varying declarations have a component usage mask associated (used by
 * r600 and svga).
 */
static uint32_t
ntt_tgsi_var_usage_mask(const struct nir_variable *var)
{
   const struct glsl_type *type_without_array =
      glsl_without_array(var->type);
   unsigned num_components = glsl_get_vector_elements(type_without_array);
   if (num_components == 0) /* structs */
      num_components = 4;

   return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
                              glsl_type_is_64bit(type_without_array));
}

static struct ureg_dst
ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
{
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
   int base = nir_intrinsic_base(instr);
   *frac = nir_intrinsic_component(instr);
   bool is_64 = nir_src_bit_size(instr->src[0]) == 64;

   struct ureg_dst out;
   if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
      unsigned semantic_name, semantic_index;
      tgsi_get_gl_frag_result_semantic(semantics.location,
                                       &semantic_name, &semantic_index);
      semantic_index += semantics.dual_source_blend_index;

      switch (semantics.location) {
      case FRAG_RESULT_DEPTH:
         *frac = 2; /* z write is the to the .z channel in TGSI */
         break;
      case FRAG_RESULT_STENCIL:
         *frac = 1;
         break;
      default:
         break;
      }

      out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
   } else {
      unsigned semantic_name, semantic_index;

      ntt_get_gl_varying_semantic(c, semantics.location,
                                  &semantic_name, &semantic_index);

      uint32_t usage_mask = ntt_tgsi_usage_mask(*frac,
                                                instr->num_components,
                                                is_64);
      uint32_t gs_streams = semantics.gs_streams;
      for (int i = 0; i < 4; i++) {
         if (!(usage_mask & (1 << i)))
            gs_streams &= ~(0x3 << 2 * i);
      }

      /* No driver appears to use array_id of outputs. */
      unsigned array_id = 0;

      /* This bit is lost in the i/o semantics, but it's unused in in-tree
       * drivers.
       */
      bool invariant = false;

      out = ureg_DECL_output_layout(c->ureg,
                                    semantic_name, semantic_index,
                                    gs_streams,
                                    base,
                                    usage_mask,
                                    array_id,
                                    semantics.num_slots,
                                    invariant);
   }

   unsigned write_mask;
   if (nir_intrinsic_has_write_mask(instr))
      write_mask = nir_intrinsic_write_mask(instr);
   else
      write_mask = ((1 << instr->num_components) - 1) << *frac;

   if (is_64) {
      write_mask = ntt_64bit_write_mask(write_mask);
      if (*frac >= 2)
         write_mask = write_mask << 2;
   } else {
      write_mask = write_mask << *frac;
   }
   return ureg_writemask(out, write_mask);
}

/* If this reg or SSA def is used only for storing an output, then in the simple
 * cases we can write directly to the TGSI output instead of having store_output
 * emit its own MOV.
 */
static bool
ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
                             struct list_head *uses, struct list_head *if_uses)
{
   *dst = ureg_dst_undef();

   switch (c->s->info.stage) {
   case MESA_SHADER_FRAGMENT:
   case MESA_SHADER_VERTEX:
      break;
   default:
      /* tgsi_exec (at least) requires that output stores happen per vertex
       * emitted, you don't get to reuse a previous output value for the next
       * vertex.
       */
      return false;
   }

   if (!list_is_empty(if_uses) || !list_is_singular(uses))
      return false;

   nir_src *src = list_first_entry(uses, nir_src, use_link);

   if (src->parent_instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src->parent_instr);
   if (intr->intrinsic != nir_intrinsic_store_output ||
       !nir_src_is_const(intr->src[1])) {
      return false;
   }

   uint32_t frac;
   *dst = ntt_output_decl(c, intr, &frac);
   dst->Index += ntt_src_as_uint(c, intr->src[1]);

   return frac == 0;
}

static void
ntt_setup_inputs(struct ntt_compile *c)
{
   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
      return;

   unsigned num_inputs = 0;
   int num_input_arrays = 0;

   nir_foreach_shader_in_variable(var, c->s) {
      const struct glsl_type *type = ntt_shader_input_type(c, var);
      unsigned array_len =
         glsl_count_attribute_slots(type, false);

      num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
   }

   c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);

   nir_foreach_shader_in_variable(var, c->s) {
      const struct glsl_type *type = ntt_shader_input_type(c, var);
      unsigned array_len =
         glsl_count_attribute_slots(type, false);

      unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
      unsigned sample_loc;
      struct ureg_src decl;

      if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
         interpolation =
            tgsi_get_interp_mode(var->data.interpolation,
                                 var->data.location == VARYING_SLOT_COL0 ||
                                 var->data.location == VARYING_SLOT_COL1);

         if (var->data.location == VARYING_SLOT_POS)
            interpolation = TGSI_INTERPOLATE_LINEAR;
      }

      unsigned semantic_name, semantic_index;
      ntt_get_gl_varying_semantic(c, var->data.location,
                                  &semantic_name, &semantic_index);

      if (var->data.sample) {
         sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
      } else if (var->data.centroid) {
         sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
         c->centroid_inputs |= (BITSET_MASK(array_len) <<
                                var->data.driver_location);
      } else {
         sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
      }

      unsigned array_id = 0;
      if (glsl_type_is_array(type))
         array_id = ++num_input_arrays;

      uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);

      decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
                                                semantic_name,
                                                semantic_index,
                                                interpolation,
                                                sample_loc,
                                                var->data.driver_location,
                                                usage_mask,
                                                array_id, array_len);

      if (semantic_name == TGSI_SEMANTIC_FACE) {
         struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
         /* NIR is ~0 front and 0 back, while TGSI is +1 front */
         ureg_SGE(c->ureg, temp, decl, ureg_imm1f(c->ureg, 0));
         decl = ureg_src(temp);
      }

      for (unsigned i = 0; i < array_len; i++) {
         c->input_index_map[var->data.driver_location + i] = decl;
         c->input_index_map[var->data.driver_location + i].Index += i;
      }
   }
}

static int
ntt_sort_by_location(const nir_variable *a, const nir_variable *b)
{
   return a->data.location - b->data.location;
}

/**
 * Workaround for virglrenderer requiring that TGSI FS output color variables
 * are declared in order.  Besides, it's a lot nicer to read the TGSI this way.
 */
static void
ntt_setup_outputs(struct ntt_compile *c)
{
   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
      return;

   nir_sort_variables_with_modes(c->s, ntt_sort_by_location, nir_var_shader_out);

   nir_foreach_shader_out_variable(var, c->s) {
      if (var->data.location == FRAG_RESULT_COLOR)
         ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);

      unsigned semantic_name, semantic_index;
      tgsi_get_gl_frag_result_semantic(var->data.location,
                                       &semantic_name, &semantic_index);

      (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
   }
}

static enum tgsi_texture_type
tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
{
   switch (dim) {
   case GLSL_SAMPLER_DIM_1D:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
      else
         return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
      else
         return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
   case GLSL_SAMPLER_DIM_3D:
      return TGSI_TEXTURE_3D;
   case GLSL_SAMPLER_DIM_CUBE:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
      else
         return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
   case GLSL_SAMPLER_DIM_RECT:
      if (is_shadow)
         return TGSI_TEXTURE_SHADOWRECT;
      else
         return TGSI_TEXTURE_RECT;
   case GLSL_SAMPLER_DIM_MS:
      return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
   case GLSL_SAMPLER_DIM_BUF:
      return TGSI_TEXTURE_BUFFER;
   default:
      unreachable("unknown sampler dim");
   }
}

static enum tgsi_return_type
tgsi_return_type_from_base_type(enum glsl_base_type type)
{
   switch (type) {
   case GLSL_TYPE_INT:
      return TGSI_RETURN_TYPE_SINT;
   case GLSL_TYPE_UINT:
      return TGSI_RETURN_TYPE_UINT;
   case GLSL_TYPE_FLOAT:
     return TGSI_RETURN_TYPE_FLOAT;
   default:
      unreachable("unexpected texture type");
   }
}

static void
ntt_setup_uniforms(struct ntt_compile *c)
{
   nir_foreach_uniform_variable(var, c->s) {
      int image_count = glsl_type_get_image_count(var->type);

      if (glsl_type_is_sampler(glsl_without_array(var->type))) {
         /* Don't use this size for the check for samplers -- arrays of structs
          * containing samplers should be ignored, and just the separate lowered
          * sampler uniform decl used.
          */
         int size = glsl_type_get_sampler_count(var->type);

         const struct glsl_type *stype = glsl_without_array(var->type);
         enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
                                                                            glsl_sampler_type_is_array(stype),
                                                                            glsl_sampler_type_is_shadow(stype));
         enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
         for (int i = 0; i < size; i++) {
            ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
               target, ret_type, ret_type, ret_type, ret_type);
            ureg_DECL_sampler(c->ureg, var->data.binding + i);
         }
      } else if (image_count) {
         const struct glsl_type *itype = glsl_without_array(var->type);
         enum tgsi_texture_type tex_type =
             tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype),
                                                glsl_sampler_type_is_array(itype), false);

         for (int i = 0; i < image_count; i++) {
            c->images[var->data.binding] = ureg_DECL_image(c->ureg,
                                                           var->data.binding + i,
                                                           tex_type,
                                                           var->data.image.format,
                                                           !(var->data.access & ACCESS_NON_WRITEABLE),
                                                           false);
         }
      } else if (glsl_contains_atomic(var->type)) {
         uint32_t offset = var->data.offset / 4;
         uint32_t size = glsl_atomic_size(var->type) / 4;
         ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0);
      }

      /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
       * size declaration happens with other UBOs below.
       */
   }

   c->first_ubo = ~0;

   unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
   nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
      int ubo = var->data.driver_location;
      if (ubo == -1)
         continue;

      if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
         c->first_ubo = MIN2(c->first_ubo, ubo);

      unsigned size = glsl_get_explicit_size(var->interface_type, false);

      int array_size = 1;
      if (glsl_type_is_interface(glsl_without_array(var->type)))
         array_size = MAX2(1, glsl_array_size(var->type));
      for (int i = 0; i < array_size; i++) {
         /* Even if multiple NIR variables are in the same uniform block, their
          * explicit size is the size of the block.
          */
         if (ubo_sizes[ubo + i])
            assert(ubo_sizes[ubo + i] == size);

         ubo_sizes[ubo + i] = size;
      }
   }

   for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
      if (ubo_sizes[i])
         ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
   }

   for (int i = 0; i < c->s->info.num_ssbos; i++) {
      /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic
       * counters
       */
      bool atomic = false;
      ureg_DECL_buffer(c->ureg, i, atomic);
   }
}

static void
ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
{
   foreach_list_typed(nir_register, nir_reg, node, list) {
      struct ureg_dst decl;
      if (nir_reg->num_array_elems == 0) {
         uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components);
         if (!ntt_try_store_in_tgsi_output(c, &decl, &nir_reg->uses, &nir_reg->if_uses)) {
            if (nir_reg->bit_size == 64) {
               if (nir_reg->num_components > 2) {
                  fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
                        nir_reg->num_components, nir_reg->index);
               }

               write_mask = ntt_64bit_write_mask(write_mask);
            }

            decl = ureg_writemask(ureg_DECL_temporary(c->ureg), write_mask);
         }
      } else {
         decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems,
                                          true);
      }
      c->reg_temp[nir_reg->index] = decl;
   }
}

static struct ureg_src
ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   if (!c->native_integers) {
      float values[4];
      assert(instr->def.bit_size == 32);
      for (int i = 0; i < num_components; i++)
         values[i] = uif(instr->value[i].u32);

      return ureg_DECL_immediate(c->ureg, values, num_components);
   } else {
      uint32_t values[4];

      if (instr->def.bit_size == 32) {
         for (int i = 0; i < num_components; i++)
            values[i] = instr->value[i].u32;
      } else {
         assert(num_components <= 2);
         for (int i = 0; i < num_components; i++) {
            values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
            values[i * 2 + 1] = instr->value[i].u64 >> 32;
         }
         num_components *= 2;
      }

      return ureg_DECL_immediate_uint(c->ureg, values, num_components);
   }
}

static struct ureg_src
ntt_reladdr(struct ntt_compile *c, struct ureg_src addr)
{
   if (c->any_reg_as_address) {
      /* Make sure we're getting the refcounting right even on any_reg
       * drivers.
       */
      c->next_addr_reg++;

      return ureg_scalar(addr, 0);
   }

   assert(c->next_addr_reg < ARRAY_SIZE(c->addr_reg));

   if (!c->addr_declared[c->next_addr_reg]) {
      c->addr_reg[c->next_addr_reg] = ureg_writemask(ureg_DECL_address(c->ureg),
                                                     TGSI_WRITEMASK_X);
      c->addr_declared[c->next_addr_reg] = true;
   }

   if (c->native_integers)
      ureg_UARL(c->ureg, c->addr_reg[c->next_addr_reg], addr);
   else
      ureg_ARL(c->ureg, c->addr_reg[c->next_addr_reg], addr);
   return ureg_scalar(ureg_src(c->addr_reg[c->next_addr_reg++]), 0);
}

static void
ntt_put_reladdr(struct ntt_compile *c)
{
   c->next_addr_reg--;
   assert(c->next_addr_reg >= 0);
}

static void
ntt_reladdr_dst_put(struct ntt_compile *c, struct ureg_dst dst)
{
   if (c->any_reg_as_address)
      return;

   if (dst.Indirect)
      ntt_put_reladdr(c);
   if (dst.DimIndirect)
      ntt_put_reladdr(c);
}

static struct ureg_src
ntt_get_src(struct ntt_compile *c, nir_src src)
{
   if (src.is_ssa) {
      if (src.ssa->parent_instr->type == nir_instr_type_load_const)
         return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr));

      return c->ssa_temp[src.ssa->index];
   } else {
      nir_register *reg = src.reg.reg;
      struct ureg_dst reg_temp = c->reg_temp[reg->index];
      reg_temp.Index += src.reg.base_offset;

      if (src.reg.indirect) {
         struct ureg_src offset = ntt_get_src(c, *src.reg.indirect);
         return ureg_src_indirect(ureg_src(reg_temp),
                                  ntt_reladdr(c, offset));
      } else {
         return ureg_src(reg_temp);
      }
   }
}

static struct ureg_src
ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
{
   nir_alu_src src = instr->src[i];
   struct ureg_src usrc = ntt_get_src(c, src.src);

   if (nir_src_bit_size(src.src) == 64) {
      int chan0 = 0, chan1 = 1;
      if (nir_op_infos[instr->op].input_sizes[i] == 0) {
         chan0 = ffs(instr->dest.write_mask) - 1;
         chan1 = ffs(instr->dest.write_mask & ~(1 << chan0)) - 1;
         if (chan1 == -1)
            chan1 = chan0;
      }
      usrc = ureg_swizzle(usrc,
                          src.swizzle[chan0] * 2,
                          src.swizzle[chan0] * 2 + 1,
                          src.swizzle[chan1] * 2,
                          src.swizzle[chan1] * 2 + 1);
   } else {
      usrc = ureg_swizzle(usrc,
                          src.swizzle[0],
                          src.swizzle[1],
                          src.swizzle[2],
                          src.swizzle[3]);
   }

   if (src.abs)
      usrc = ureg_abs(usrc);
   if (src.negate)
      usrc = ureg_negate(usrc);

   return usrc;
}

/* Reswizzles a source so that the unset channels in the write mask still refer
 * to one of the channels present in the write mask.
 */
static struct ureg_src
ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
{
   assert(write_mask);
   int first_chan = ffs(write_mask) - 1;
   return ureg_swizzle(src,
                       (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
                       (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
                       (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
                       (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
}

static struct ureg_dst
ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
{
   uint32_t writemask = BITSET_MASK(ssa->num_components);
   if (ssa->bit_size == 64)
      writemask = ntt_64bit_write_mask(writemask);

   struct ureg_dst dst;
   if (!ntt_try_store_in_tgsi_output(c, &dst, &ssa->uses, &ssa->if_uses))
      dst = ureg_DECL_temporary(c->ureg);

   c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask);

   return ureg_writemask(dst, writemask);
}

static struct ureg_dst
ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest)
{
   if (dest->is_ssa)
      return ntt_get_ssa_def_decl(c, &dest->ssa);
   else
      return c->reg_temp[dest->reg.reg->index];
}

static struct ureg_dst
ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
{
   struct ureg_dst dst = ntt_get_dest_decl(c, dest);

   if (!dest->is_ssa) {
      dst.Index += dest->reg.base_offset;

      if (dest->reg.indirect) {
         struct ureg_src offset = ntt_get_src(c, *dest->reg.indirect);
         dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset));
      }
   }

   return dst;
}

/* For an SSA dest being populated by a constant src, replace the storage with
 * a copy of the ureg_src.
 */
static void
ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src)
{
   if (!src.Indirect && !src.DimIndirect) {
      switch (src.File) {
      case TGSI_FILE_IMMEDIATE:
      case TGSI_FILE_INPUT:
      case TGSI_FILE_CONSTANT:
      case TGSI_FILE_SYSTEM_VALUE:
         c->ssa_temp[def->index] = src;
         return;
      }
   }

   ureg_MOV(c->ureg, ntt_get_ssa_def_decl(c, def), src);
}

static void
ntt_store(struct ntt_compile *c, nir_dest *dest, struct ureg_src src)
{
   if (dest->is_ssa)
      ntt_store_def(c, &dest->ssa, src);
   else {
      struct ureg_dst dst = ntt_get_dest(c, dest);
      ureg_MOV(c->ureg, dst, src);
   }
}

static void
ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
                struct ureg_dst dst,
                struct ureg_src src0,
                struct ureg_src src1)
{
   unsigned i;
   int num_src;

   /* POW is the only 2-operand scalar op. */
   if (tgsi_op  == TGSI_OPCODE_POW) {
      num_src = 2;
   } else {
      num_src = 1;
      src1 = src0;
   }

   for (i = 0; i < 4; i++) {
      if (dst.WriteMask & (1 << i)) {
         struct ureg_dst this_dst = dst;
         struct ureg_src srcs[2] = {
            ureg_scalar(src0, i),
            ureg_scalar(src1, i),
         };
         this_dst.WriteMask = (1 << i);

         ureg_insn(c->ureg, tgsi_op, &this_dst, 1, srcs, num_src, false);
      }
   }
}

static void
ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
{
   struct ureg_src src[4];
   struct ureg_dst dst;
   unsigned i;
   int dst_64 = nir_dest_bit_size(instr->dest.dest) == 64;
   int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
   int num_srcs = nir_op_infos[instr->op].num_inputs;

   assert(num_srcs <= ARRAY_SIZE(src));
   for (i = 0; i < num_srcs; i++)
      src[i] = ntt_get_alu_src(c, instr, i);
   dst = ntt_get_dest(c, &instr->dest.dest);

   if (instr->dest.saturate)
      dst.Saturate = true;

   if (dst_64)
      dst = ureg_writemask(dst, ntt_64bit_write_mask(instr->dest.write_mask));
   else
      dst = ureg_writemask(dst, instr->dest.write_mask);

   static enum tgsi_opcode op_map[][2] = {
      [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },

      /* fabs/fneg 32-bit are special-cased below. */
      [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
      [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },

      [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
      [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
      [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
      [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
      [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
      [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
      [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
      [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
      [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
      [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },

      [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
      [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
      [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },

      /* The conversions will have one combination of src and dst bitsize. */
      [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
      [nir_op_f2f64] = { TGSI_OPCODE_F2D },
      [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },

      [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
      [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
      [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
      [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
      [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
      [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
      [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
      [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },

      [nir_op_slt] = { TGSI_OPCODE_SLT },
      [nir_op_sge] = { TGSI_OPCODE_SGE },
      [nir_op_seq] = { TGSI_OPCODE_SEQ },
      [nir_op_sne] = { TGSI_OPCODE_SNE },

      [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
      [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
      [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
      [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },

      [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
      [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
      [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
      [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },

      [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
      [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },

      [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
      [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
      [nir_op_fsign] = { TGSI_OPCODE_SSG },
      [nir_op_isign] = { TGSI_OPCODE_ISSG },
      [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
      [nir_op_fddx] = { TGSI_OPCODE_DDX },
      [nir_op_fddy] = { TGSI_OPCODE_DDY },
      [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX },
      [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY },
      [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE },
      [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE },
      [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
      [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
      [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
      [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
      [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
      [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
      [nir_op_bit_count] = { TGSI_OPCODE_POPC },
      [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
      [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
      [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
      [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
      [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
      [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
      [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
      [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
      [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
      [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
      [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
      [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
      [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
      [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },

      /* These bitwise ops don't care about 32 vs 64 types, so they have the
       * same TGSI op.
       */
      [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
      [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
      [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
      [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },

      [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
      [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
      [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
      [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
      [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
      [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
      [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
      [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
   };

   /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
    * of .xy.  Store to a temp and move it to the real dst.
    */
   bool tgsi_64bit_compare = src_64 && !dst_64 &&
      (num_srcs == 2 ||
        nir_op_infos[instr->op].output_type == nir_type_bool32) &&
      (dst.WriteMask != TGSI_WRITEMASK_X);

   /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
    * channels and will need to get fixed up.
    */
   bool tgsi_64bit_downconvert = (src_64 && !dst_64 &&
                                  num_srcs == 1 && !tgsi_64bit_compare &&
                                  (dst.WriteMask & ~TGSI_WRITEMASK_XY));

   struct ureg_dst real_dst = ureg_dst_undef();
   if (tgsi_64bit_compare || tgsi_64bit_downconvert) {
      real_dst = dst;
      dst = ureg_DECL_temporary(c->ureg);
   }

   bool table_op64 = src_64;
   if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
      /* The normal path for NIR to TGSI ALU op translation */
      ureg_insn(c->ureg, op_map[instr->op][table_op64],
                &dst, 1, src, num_srcs, false);
   } else {
      /* Special cases for NIR to TGSI ALU op translation. */

      /* TODO: Use something like the ntt_store() path for the MOV calls so we
       * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
       */

      switch (instr->op) {
      case nir_op_u2u64:
         ureg_AND(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                  ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
         break;

      case nir_op_i2i32:
      case nir_op_u2u32:
         assert(src_64);
         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
         break;

      case nir_op_fabs:
         ureg_MOV(c->ureg, dst, ureg_abs(src[0]));
         break;

      case nir_op_fsat:
         if (dst_64) {
            ureg_MIN(c->ureg, dst, src[0], ntt_64bit_1f(c));
            ureg_MAX(c->ureg, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
         } else {
            ureg_MOV(c->ureg, ureg_saturate(dst), src[0]);
         }
         break;

      case nir_op_fneg:
         ureg_MOV(c->ureg, dst, ureg_negate(src[0]));
         break;

         /* NOTE: TGSI 32-bit math ops have the old "one source channel
          * replicated to all dst channels" behavior, while 64 is normal mapping
          * of src channels to dst.
          */
      case nir_op_frcp:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], src[1]);
         break;

      case nir_op_frsq:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], src[1]);
         break;

      case nir_op_fsqrt:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], src[1]);
         break;

      case nir_op_fexp2:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], src[1]);
         break;

      case nir_op_flog2:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], src[1]);
         break;

      case nir_op_b2f32:
         ureg_AND(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 1.0));
         break;

      case nir_op_b2f64:
         ureg_AND(c->ureg, dst,
                  ureg_swizzle(src[0],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                  ntt_64bit_1f(c));
         break;

      case nir_op_f2b32:
         if (src_64)
            ureg_DSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
         else
            ureg_FSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
         break;

      case nir_op_i2b32:
         if (src_64) {
            ureg_U64SNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
         } else
            ureg_USNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
         break;

      case nir_op_b2i32:
         ureg_AND(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 1));
         break;

      case nir_op_b2i64:
         ureg_AND(c->ureg, dst,
                  ureg_swizzle(src[0],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                  ureg_imm4u(c->ureg, 1, 0, 1, 0));
         break;

      case nir_op_fsin:
         ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], src[1]);
         break;

      case nir_op_fcos:
         ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], src[1]);
         break;

      case nir_op_fsub:
         assert(!dst_64);
         ureg_ADD(c->ureg, dst, src[0], ureg_negate(src[1]));
         break;

      case nir_op_isub:
         assert(!dst_64);
         ureg_UADD(c->ureg, dst, src[0], ureg_negate(src[1]));
         break;

      case nir_op_fmod:
         unreachable("should be handled by .lower_fmod = true");
         break;

      case nir_op_fpow:
         ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
         break;

      case nir_op_flrp:
         ureg_LRP(c->ureg, dst, src[2], src[1], src[0]);
         break;

      case nir_op_pack_64_2x32_split:
         ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
                  ureg_swizzle(src[0],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_YW),
                  ureg_swizzle(src[1],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         break;

      case nir_op_unpack_64_2x32_split_x:
         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
         break;

      case nir_op_unpack_64_2x32_split_y:
         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
         break;

      case nir_op_b32csel:
         if (nir_src_bit_size(instr->src[1].src) == 64) {
            ureg_UCMP(c->ureg, dst, ureg_swizzle(src[0],
                                                 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                                 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                      src[1], src[2]);
         } else {
            ureg_UCMP(c->ureg, dst, src[0], src[1], src[2]);
         }
         break;

      case nir_op_fcsel:
         /* NIR is src0 != 0 ? src1 : src2.
          * TGSI is src0 < 0 ? src1 : src2.
          *
          * However, fcsel so far as I can find only appears on bools-as-floats
          * (1.0 or 0.0), so we can just negate it for the TGSI op.  It's
          * important to not have an abs here, as i915g has to make extra
          * instructions to do the abs.
          */
         ureg_CMP(c->ureg, dst, ureg_negate(src[0]), src[1], src[2]);
         break;

         /* It would be nice if we could get this left as scalar in NIR, since
          * the TGSI op is scalar.
          */
      case nir_op_frexp_sig:
      case nir_op_frexp_exp: {
         assert(src_64);
         struct ureg_dst temp = ureg_DECL_temporary(c->ureg);

         for (int chan = 0; chan < 2; chan++) {
            int wm = 1 << chan;

            if (!(instr->dest.write_mask & wm))
               continue;

            struct ureg_dst dsts[2] = { temp, temp };
            if (instr->op == nir_op_frexp_sig) {
               dsts[0] = ureg_writemask(dst, ntt_64bit_write_mask(wm));
            } else {
               dsts[1] = ureg_writemask(dst, wm);
            }

            struct ureg_src chan_src = ureg_swizzle(src[0],
                                                    chan * 2, chan * 2 + 1,
                                                    chan * 2, chan * 2 + 1);

            ureg_insn(c->ureg, TGSI_OPCODE_DFRACEXP,
                      dsts, 2,
                      &chan_src, 1, false);
         }

         ureg_release_temporary(c->ureg, temp);
         break;
      }

      case nir_op_ldexp:
         assert(dst_64); /* 32bit handled in table. */
         ureg_DLDEXP(c->ureg, dst, src[0],
                     ureg_swizzle(src[1],
                                  TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                  TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         break;

      case nir_op_vec4:
      case nir_op_vec3:
      case nir_op_vec2:
         unreachable("covered by nir_lower_vec_to_movs()");

      default:
         fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
         unreachable("Unknown NIR opcode");
      }
   }

   /* 64-bit op fixup movs */
   if (!ureg_dst_is_undef(real_dst)) {
      if (tgsi_64bit_compare) {
         ureg_MOV(c->ureg, real_dst,
                  ureg_swizzle(ureg_src(dst), 0, 2, 0, 2));
      } else {
         assert(tgsi_64bit_downconvert);
         uint8_t swizzle[] = {0, 0, 0, 0};
         uint32_t second_bit = real_dst.WriteMask & ~(1 << (ffs(real_dst.WriteMask) - 1));
         if (second_bit)
            swizzle[ffs(second_bit) - 1] = 1;
         ureg_MOV(c->ureg, real_dst, ureg_swizzle(ureg_src(dst),
                                                  swizzle[0],
                                                  swizzle[1],
                                                  swizzle[2],
                                                  swizzle[3]));
      }
      ureg_release_temporary(c->ureg, dst);
   }
}

static struct ureg_src
ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
                      nir_src src)
{
   if (nir_src_is_const(src)) {
      usrc.Index += ntt_src_as_uint(c, src);
      return usrc;
   } else {
      return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src)));
   }
}

static struct ureg_dst
ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
                      nir_src src)
{
   if (nir_src_is_const(src)) {
      dst.Index += ntt_src_as_uint(c, src);
      return dst;
   } else {
      return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src)));
   }
}

static struct ureg_src
ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
                         nir_src src)
{
   if (nir_src_is_const(src)) {
      return ureg_src_dimension(usrc, ntt_src_as_uint(c, src));
   }
   else
   {
      return ureg_src_dimension_indirect(usrc,
                                         ntt_reladdr(c, ntt_get_src(c, src)),
                                         0);
   }
}

static struct ureg_dst
ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst,
                                nir_src src)
{
   if (nir_src_is_const(src)) {
      return ureg_dst_dimension(udst, ntt_src_as_uint(c, src));
   } else {
      return ureg_dst_dimension_indirect(udst,
                                         ntt_reladdr(c, ntt_get_src(c, src)),
                                         0);
   }
}
/* Some load operations in NIR will have a fractional offset that we need to
 * swizzle down before storing to the result register.
 */
static struct ureg_src
ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
{
   return ureg_swizzle(src,
                       frac,
                       frac + MIN2(num_components - 1, 1),
                       frac + MIN2(num_components - 1, 2),
                       frac + MIN2(num_components - 1, 3));
}


static void
ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   int bit_size = nir_dest_bit_size(instr->dest);
   assert(bit_size == 32 || instr->num_components <= 2);

   struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);

   struct ureg_dst addr_temp = ureg_dst_undef();

   if (nir_src_is_const(instr->src[0])) {
      src = ureg_src_dimension(src, ntt_src_as_uint(c, instr->src[0]));
   } else {
      /* virglrenderer requires that indirect UBO references have the UBO
       * array's base index in the Index field, not added to the indrect
       * address.
       *
       * Many nir intrinsics have a base address const value for the start of
       * their array indirection, but load_ubo doesn't.  We fake it by
       * subtracting it off here.
       */
      addr_temp = ureg_DECL_temporary(c->ureg);
      ureg_UADD(c->ureg, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
      src = ureg_src_dimension_indirect(src,
                                         ntt_reladdr(c, ureg_src(addr_temp)),
                                         c->first_ubo);
   }

   if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
      /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
       * file.
       */

      if (nir_src_is_const(instr->src[1])) {
         src.Index += ntt_src_as_uint(c, instr->src[1]);
      } else {
         src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1])));
      }

      int start_component = nir_intrinsic_component(instr);
      if (bit_size == 64)
         start_component *= 2;

      src = ntt_shift_by_frac(src, start_component,
                              instr->num_components * bit_size / 32);

      ntt_store(c, &instr->dest, src);
   } else {
      /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a
       * TGSI_OPCODE_LOAD instruction from the const file.
       */
      struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
      struct ureg_src srcs[2] = {
          src,
          ntt_get_src(c, instr->src[1]),
      };
      ureg_memory_insn(c->ureg, TGSI_OPCODE_LOAD,
                       &dst, 1,
                       srcs, ARRAY_SIZE(srcs),
                       0 /* qualifier */,
                       0 /* tex target */,
                       0 /* format: unused */
      );
   }

   ureg_release_temporary(c->ureg, addr_temp);
}

static unsigned
ntt_get_access_qualifier(nir_intrinsic_instr *instr)
{
   enum gl_access_qualifier access = nir_intrinsic_access(instr);
   unsigned qualifier = 0;

   if (access & ACCESS_COHERENT)
      qualifier |= TGSI_MEMORY_COHERENT;
   if (access & ACCESS_VOLATILE)
      qualifier |= TGSI_MEMORY_VOLATILE;
   if (access & ACCESS_RESTRICT)
      qualifier |= TGSI_MEMORY_RESTRICT;

   return qualifier;
}

static void
ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
             nir_variable_mode mode)
{
   bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
                    instr->intrinsic == nir_intrinsic_store_shared);
   bool is_load = (instr->intrinsic == nir_intrinsic_atomic_counter_read ||
                    instr->intrinsic == nir_intrinsic_load_ssbo ||
                    instr->intrinsic == nir_intrinsic_load_shared);
   unsigned opcode;
   struct ureg_src src[4];
   int num_src = 0;
   int nir_src;
   struct ureg_dst addr_temp = ureg_dst_undef();

   struct ureg_src memory;
   switch (mode) {
   case nir_var_mem_ssbo:
      memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER, 0),
                                     instr->src[is_store ? 1 : 0]);
      nir_src = 1;
      break;
   case nir_var_mem_shared:
      memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
      nir_src = 0;
      break;
   case nir_var_uniform: { /* HW atomic buffers */
      memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, 0);
      /* ntt_ureg_src_indirect, except dividing by 4 */
      if (nir_src_is_const(instr->src[0])) {
         memory.Index += nir_src_as_uint(instr->src[0]) / 4;
      } else {
         addr_temp = ureg_DECL_temporary(c->ureg);
         ureg_USHR(c->ureg, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, 2));
         memory = ureg_src_indirect(memory, ntt_reladdr(c, ureg_src(addr_temp)));
      }
      memory = ureg_src_dimension(memory, nir_intrinsic_base(instr));
      nir_src = 0;
      break;
   }

   default:
      unreachable("unknown memory type");
   }

   if (is_store) {
      src[num_src++] = ntt_get_src(c, instr->src[nir_src + 1]); /* offset */
      src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
   } else {
      src[num_src++] = memory;
      if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
         src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* offset */
         switch (instr->intrinsic) {
         case nir_intrinsic_atomic_counter_inc:
            src[num_src++] = ureg_imm1i(c->ureg, 1);
            break;
         case nir_intrinsic_atomic_counter_post_dec:
            src[num_src++] = ureg_imm1i(c->ureg, -1);
            break;
         default:
            if (!is_load)
               src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* value */
            break;
         }
      }
   }


   switch (instr->intrinsic) {
   case nir_intrinsic_atomic_counter_add:
   case nir_intrinsic_atomic_counter_inc:
   case nir_intrinsic_atomic_counter_post_dec:
   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_shared_atomic_add:
      opcode = TGSI_OPCODE_ATOMUADD;
      break;
   case nir_intrinsic_ssbo_atomic_fadd:
   case nir_intrinsic_shared_atomic_fadd:
      opcode = TGSI_OPCODE_ATOMFADD;
      break;
   case nir_intrinsic_atomic_counter_min:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_shared_atomic_imin:
      opcode = TGSI_OPCODE_ATOMIMIN;
      break;
   case nir_intrinsic_atomic_counter_max:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_shared_atomic_imax:
      opcode = TGSI_OPCODE_ATOMIMAX;
      break;
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_shared_atomic_umin:
      opcode = TGSI_OPCODE_ATOMUMIN;
      break;
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_shared_atomic_umax:
      opcode = TGSI_OPCODE_ATOMUMAX;
      break;
   case nir_intrinsic_atomic_counter_and:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_shared_atomic_and:
      opcode = TGSI_OPCODE_ATOMAND;
      break;
   case nir_intrinsic_atomic_counter_or:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_shared_atomic_or:
      opcode = TGSI_OPCODE_ATOMOR;
      break;
   case nir_intrinsic_atomic_counter_xor:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_shared_atomic_xor:
      opcode = TGSI_OPCODE_ATOMXOR;
      break;
   case nir_intrinsic_atomic_counter_exchange:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_shared_atomic_exchange:
      opcode = TGSI_OPCODE_ATOMXCHG;
      break;
   case nir_intrinsic_atomic_counter_comp_swap:
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_shared_atomic_comp_swap:
      opcode = TGSI_OPCODE_ATOMCAS;
      src[num_src++] = ntt_get_src(c, instr->src[nir_src++]);
      break;
   case nir_intrinsic_atomic_counter_read:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_shared:
      opcode = TGSI_OPCODE_LOAD;
      break;
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_store_shared:
      opcode = TGSI_OPCODE_STORE;
      break;
   case nir_intrinsic_get_ssbo_size:
      opcode = TGSI_OPCODE_RESQ;
      break;
   default:
      unreachable("unknown memory op");
   }

   unsigned qualifier = 0;
   if (mode == nir_var_mem_ssbo &&
       instr->intrinsic != nir_intrinsic_get_ssbo_size) {
      qualifier = ntt_get_access_qualifier(instr);
   }

   struct ureg_dst dst;
   if (is_store) {
      dst = ureg_dst(memory);

      unsigned write_mask = nir_intrinsic_write_mask(instr);
      if (nir_src_bit_size(instr->src[0]) == 64)
         write_mask = ntt_64bit_write_mask(write_mask);
      dst = ureg_writemask(dst, write_mask);
   } else {
      dst = ntt_get_dest(c, &instr->dest);
   }

   ureg_memory_insn(c->ureg, opcode,
                    &dst, 1,
                    src, num_src,
                    qualifier,
                    TGSI_TEXTURE_BUFFER,
                    0 /* format: unused */);

   ureg_release_temporary(c->ureg, addr_temp);
}

static void
ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   unsigned op;
   struct ureg_src srcs[4];
   int num_src = 0;
   enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
   bool is_array = nir_intrinsic_image_array(instr);

   struct ureg_dst temp = ureg_dst_undef();

   enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false);

   struct ureg_src resource =
      ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
                            instr->src[0]);

   struct ureg_dst dst;
   if (instr->intrinsic == nir_intrinsic_image_store) {
      dst = ureg_dst(resource);
   } else {
      srcs[num_src++] = resource;
      dst = ntt_get_dest(c, &instr->dest);
   }

   if (instr->intrinsic != nir_intrinsic_image_size) {
      struct ureg_src coord = ntt_get_src(c, instr->src[1]);

      if (dim == GLSL_SAMPLER_DIM_MS) {
         temp = ureg_DECL_temporary(c->ureg);
         ureg_MOV(c->ureg, temp, coord);
         ureg_MOV(c->ureg, ureg_writemask(temp, 1 << (is_array ? 3 : 2)),
                  ureg_scalar(ntt_get_src(c, instr->src[2]), TGSI_SWIZZLE_X));
         coord = ureg_src(temp);
      }
      srcs[num_src++] = coord;

      if (instr->intrinsic != nir_intrinsic_image_load) {
         srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
         if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
            srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_image_load:
      op = TGSI_OPCODE_LOAD;
      break;
   case nir_intrinsic_image_store:
      op = TGSI_OPCODE_STORE;
      break;
   case nir_intrinsic_image_size:
      op = TGSI_OPCODE_RESQ;
      break;
   case nir_intrinsic_image_atomic_add:
      op = TGSI_OPCODE_ATOMUADD;
      break;
   case nir_intrinsic_image_atomic_fadd:
      op = TGSI_OPCODE_ATOMFADD;
      break;
   case nir_intrinsic_image_atomic_imin:
      op = TGSI_OPCODE_ATOMIMIN;
      break;
   case nir_intrinsic_image_atomic_umin:
      op = TGSI_OPCODE_ATOMUMIN;
      break;
   case nir_intrinsic_image_atomic_imax:
      op = TGSI_OPCODE_ATOMIMAX;
      break;
   case nir_intrinsic_image_atomic_umax:
      op = TGSI_OPCODE_ATOMUMAX;
      break;
   case nir_intrinsic_image_atomic_and:
      op = TGSI_OPCODE_ATOMAND;
      break;
   case nir_intrinsic_image_atomic_or:
      op = TGSI_OPCODE_ATOMOR;
      break;
   case nir_intrinsic_image_atomic_xor:
      op = TGSI_OPCODE_ATOMXOR;
      break;
   case nir_intrinsic_image_atomic_exchange:
      op = TGSI_OPCODE_ATOMXCHG;
      break;
   case nir_intrinsic_image_atomic_comp_swap:
      op = TGSI_OPCODE_ATOMCAS;
      break;
   default:
      unreachable("bad op");
   }

   ureg_memory_insn(c->ureg, op, &dst, 1, srcs, num_src,
                    ntt_get_access_qualifier(instr),
                    target,
                    nir_intrinsic_format(instr));

   if (!ureg_dst_is_undef(temp))
      ureg_release_temporary(c->ureg, temp);
}

static void
ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   uint32_t frac = nir_intrinsic_component(instr);
   uint32_t num_components = instr->num_components;
   unsigned base = nir_intrinsic_base(instr);
   struct ureg_src input;
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
   bool is_64 = nir_dest_bit_size(instr->dest) == 64;

   if (c->s->info.stage == MESA_SHADER_VERTEX) {
      input = ureg_DECL_vs_input(c->ureg, base);
      for (int i = 1; i < semantics.num_slots; i++)
         ureg_DECL_vs_input(c->ureg, base + i);
   } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
      unsigned semantic_name, semantic_index;
      ntt_get_gl_varying_semantic(c, semantics.location,
                                  &semantic_name, &semantic_index);

      /* XXX: ArrayID is used in r600 gs inputs */
      uint32_t array_id = 0;

      input = ureg_DECL_input_layout(c->ureg,
                                     semantic_name,
                                     semantic_index,
                                     base,
                                     ntt_tgsi_usage_mask(frac,
                                                         instr->num_components,
                                                         is_64),
                                     array_id,
                                     semantics.num_slots);
   } else {
      input = c->input_index_map[base];
   }

   if (is_64)
      num_components *= 2;

   input = ntt_shift_by_frac(input, frac, num_components);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_input:
      input = ntt_ureg_src_indirect(c, input, instr->src[0]);
      ntt_store(c, &instr->dest, input);
      break;

   case nir_intrinsic_load_per_vertex_input:
      input = ntt_ureg_src_indirect(c, input, instr->src[1]);
      input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
      ntt_store(c, &instr->dest, input);
      break;

   case nir_intrinsic_load_interpolated_input: {
      input = ntt_ureg_src_indirect(c, input, instr->src[1]);

      nir_intrinsic_instr *bary_instr =
         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);

      switch (bary_instr->intrinsic) {
      case nir_intrinsic_load_barycentric_pixel:
      case nir_intrinsic_load_barycentric_sample:
         /* For these, we know that the barycentric load matches the
          * interpolation on the input declaration, so we can use it directly.
          */
         ntt_store(c, &instr->dest, input);
         break;

      case nir_intrinsic_load_barycentric_centroid:
         /* If the input was declared centroid, then there's no need to
          * emit the extra TGSI interp instruction, we can just read the
          * input.
          */
         if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
            ntt_store(c, &instr->dest, input);
         } else {
            ureg_INTERP_CENTROID(c->ureg, ntt_get_dest(c, &instr->dest),
                                 input);
         }
         break;

      case nir_intrinsic_load_barycentric_at_sample:
         /* We stored the sample in the fake "bary" dest. */
         ureg_INTERP_SAMPLE(c->ureg, ntt_get_dest(c, &instr->dest), input,
                            ntt_get_src(c, instr->src[0]));
         break;

      case nir_intrinsic_load_barycentric_at_offset:
         /* We stored the offset in the fake "bary" dest. */
         ureg_INTERP_OFFSET(c->ureg, ntt_get_dest(c, &instr->dest), input,
                            ntt_get_src(c, instr->src[0]));
         break;

      default:
         unreachable("bad barycentric interp intrinsic\n");
      }
      break;
   }

   default:
      unreachable("bad load input intrinsic\n");
   }
}

static void
ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   struct ureg_src src = ntt_get_src(c, instr->src[0]);

   if (src.File == TGSI_FILE_OUTPUT) {
      /* If our src is the output file, that's an indication that we were able
       * to emit the output stores in the generating instructions and we have
       * nothing to do here.
       */
      return;
   }

   uint32_t frac;
   struct ureg_dst out = ntt_output_decl(c, instr, &frac);

   if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
      out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
      out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
   } else {
      out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
   }

   uint8_t swizzle[4] = { 0, 0, 0, 0 };
   for (int i = frac; i <= 4; i++) {
      if (out.WriteMask & (1 << i))
         swizzle[i] = i - frac;
   }

   src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);

   ureg_MOV(c->ureg, out, src);
   ntt_reladdr_dst_put(c, out);
}

static void
ntt_emit_load_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   /* ntt_try_store_in_tgsi_output() optimization is not valid if load_output
    * is present.
    */
   assert(c->s->info.stage != MESA_SHADER_VERTEX &&
          c->s->info.stage != MESA_SHADER_FRAGMENT);

   uint32_t frac;
   struct ureg_dst out = ntt_output_decl(c, instr, &frac);

   if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
      out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
      out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[0]);
   } else {
      out = ntt_ureg_dst_indirect(c, out, instr->src[0]);
   }

   ureg_MOV(c->ureg, ntt_get_dest(c, &instr->dest), ureg_src(out));
   ntt_reladdr_dst_put(c, out);
}

static void
ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
   enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
   struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);

   /* virglrenderer doesn't like references to channels of the sysval that
    * aren't defined, even if they aren't really read.  (GLSL compile fails on
    * gl_NumWorkGroups.w, for example).
    */
   uint32_t write_mask = BITSET_MASK(nir_dest_num_components(instr->dest));
   sv = ntt_swizzle_for_write_mask(sv, write_mask);

   /* TGSI and NIR define these intrinsics as always loading ints, but they can
    * still appear on hardware with non-native-integers fragment shaders using
    * the draw path (i915g).  In that case, having called nir_lower_int_to_float
    * means that we actually want floats instead.
    */
   if (!c->native_integers) {
      switch (instr->intrinsic) {
      case nir_intrinsic_load_vertex_id:
      case nir_intrinsic_load_instance_id:
         ureg_U2F(c->ureg, ntt_get_dest(c, &instr->dest), sv);
         return;

      default:
         break;
      }
   }

   ntt_store(c, &instr->dest, sv);
}

static void
ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
      ntt_emit_load_ubo(c, instr);
      break;

      /* Vertex */
   case nir_intrinsic_load_vertex_id:
   case nir_intrinsic_load_vertex_id_zero_base:
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_instance_id:
   case nir_intrinsic_load_draw_id:
   case nir_intrinsic_load_invocation_id:
   case nir_intrinsic_load_frag_coord:
   case nir_intrinsic_load_point_coord:
   case nir_intrinsic_load_front_face:
   case nir_intrinsic_load_sample_id:
   case nir_intrinsic_load_sample_pos:
   case nir_intrinsic_load_sample_mask_in:
   case nir_intrinsic_load_helper_invocation:
   case nir_intrinsic_load_tess_coord:
   case nir_intrinsic_load_patch_vertices_in:
   case nir_intrinsic_load_primitive_id:
   case nir_intrinsic_load_tess_level_outer:
   case nir_intrinsic_load_tess_level_inner:
   case nir_intrinsic_load_local_invocation_id:
   case nir_intrinsic_load_workgroup_id:
   case nir_intrinsic_load_num_workgroups:
   case nir_intrinsic_load_workgroup_size:
   case nir_intrinsic_load_subgroup_size:
   case nir_intrinsic_load_subgroup_invocation:
   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_lt_mask:
      ntt_emit_load_sysval(c, instr);
      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_load_interpolated_input:
      ntt_emit_load_input(c, instr);
      break;

   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
      ntt_emit_store_output(c, instr);
      break;

   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output:
      ntt_emit_load_output(c, instr);
      break;

   case nir_intrinsic_discard:
      ureg_KILL(c->ureg);
      break;

   case nir_intrinsic_discard_if: {
      struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);

      if (c->native_integers) {
         struct ureg_dst temp = ureg_writemask(ureg_DECL_temporary(c->ureg), 1);
         ureg_AND(c->ureg, temp, cond, ureg_imm1f(c->ureg, 1.0));
         ureg_KILL_IF(c->ureg, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
         ureg_release_temporary(c->ureg, temp);
      } else {
         /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
         ureg_KILL_IF(c->ureg, ureg_negate(cond));
      }
      break;
   }

   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_ssbo_atomic_fadd:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_get_ssbo_size:
      ntt_emit_mem(c, instr, nir_var_mem_ssbo);
      break;

   case nir_intrinsic_load_shared:
   case nir_intrinsic_store_shared:
   case nir_intrinsic_shared_atomic_add:
   case nir_intrinsic_shared_atomic_fadd:
   case nir_intrinsic_shared_atomic_imin:
   case nir_intrinsic_shared_atomic_imax:
   case nir_intrinsic_shared_atomic_umin:
   case nir_intrinsic_shared_atomic_umax:
   case nir_intrinsic_shared_atomic_and:
   case nir_intrinsic_shared_atomic_or:
   case nir_intrinsic_shared_atomic_xor:
   case nir_intrinsic_shared_atomic_exchange:
   case nir_intrinsic_shared_atomic_comp_swap:
      ntt_emit_mem(c, instr, nir_var_mem_shared);
      break;

   case nir_intrinsic_atomic_counter_read:
   case nir_intrinsic_atomic_counter_add:
   case nir_intrinsic_atomic_counter_inc:
   case nir_intrinsic_atomic_counter_post_dec:
   case nir_intrinsic_atomic_counter_min:
   case nir_intrinsic_atomic_counter_max:
   case nir_intrinsic_atomic_counter_and:
   case nir_intrinsic_atomic_counter_or:
   case nir_intrinsic_atomic_counter_xor:
   case nir_intrinsic_atomic_counter_exchange:
   case nir_intrinsic_atomic_counter_comp_swap:
      ntt_emit_mem(c, instr, nir_var_uniform);
      break;
   case nir_intrinsic_atomic_counter_pre_dec:
      unreachable("Should be lowered by ntt_lower_atomic_pre_dec()");
      break;

   case nir_intrinsic_image_load:
   case nir_intrinsic_image_store:
   case nir_intrinsic_image_size:
   case nir_intrinsic_image_atomic_add:
   case nir_intrinsic_image_atomic_fadd:
   case nir_intrinsic_image_atomic_imin:
   case nir_intrinsic_image_atomic_umin:
   case nir_intrinsic_image_atomic_imax:
   case nir_intrinsic_image_atomic_umax:
   case nir_intrinsic_image_atomic_and:
   case nir_intrinsic_image_atomic_or:
   case nir_intrinsic_image_atomic_xor:
   case nir_intrinsic_image_atomic_exchange:
   case nir_intrinsic_image_atomic_comp_swap:
      ntt_emit_image_load_store(c, instr);
      break;

   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier_tcs_patch:
      ureg_BARRIER(c->ureg);
      break;

   case nir_intrinsic_memory_barrier:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
                                      TGSI_MEMBAR_SHADER_BUFFER |
                                      TGSI_MEMBAR_ATOMIC_BUFFER |
                                      TGSI_MEMBAR_SHADER_IMAGE |
                                      TGSI_MEMBAR_SHARED));
      break;

   case nir_intrinsic_memory_barrier_atomic_counter:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_ATOMIC_BUFFER));
      break;

   case nir_intrinsic_memory_barrier_buffer:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_BUFFER));
      break;

   case nir_intrinsic_memory_barrier_image:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_IMAGE));
      break;

   case nir_intrinsic_memory_barrier_shared:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHARED));
      break;

   case nir_intrinsic_group_memory_barrier:
      ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
                                      TGSI_MEMBAR_SHADER_BUFFER |
                                      TGSI_MEMBAR_ATOMIC_BUFFER |
                                      TGSI_MEMBAR_SHADER_IMAGE |
                                      TGSI_MEMBAR_SHARED |
                                      TGSI_MEMBAR_THREAD_GROUP));
      break;

   case nir_intrinsic_end_primitive:
      ureg_ENDPRIM(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
      break;

   case nir_intrinsic_emit_vertex:
      ureg_EMIT(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
      break;

      /* In TGSI we don't actually generate the barycentric coords, and emit
       * interp intrinsics later.  However, we do need to store the
       * load_barycentric_at_* argument so that we can use it at that point.
       */
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid:
   case nir_intrinsic_load_barycentric_sample:
      break;
   case nir_intrinsic_load_barycentric_at_sample:
   case nir_intrinsic_load_barycentric_at_offset:
      ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0]));
      break;

   default:
      fprintf(stderr, "Unknown intrinsic: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
      break;
   }
}

struct ntt_tex_operand_state {
   struct ureg_src srcs[4];
   unsigned i;
};

static void
ntt_push_tex_arg(struct ntt_compile *c,
                 nir_tex_instr *instr,
                 nir_tex_src_type tex_src_type,
                 struct ntt_tex_operand_state *s)
{
   int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
   if (tex_src < 0)
      return;

   s->srcs[s->i++] = ntt_get_src(c, instr->src[tex_src].src);
}

static void
ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
{
   struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
   enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
   unsigned tex_opcode;

   struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
   int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
   if (sampler_src >= 0) {
      struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
      sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr));
   }

   switch (instr->op) {
   case nir_texop_tex:
      if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
         MAX2(instr->coord_components, 2) + instr->is_shadow)
         tex_opcode = TGSI_OPCODE_TXP;
      else
         tex_opcode = TGSI_OPCODE_TEX;
      break;
   case nir_texop_txf:
   case nir_texop_txf_ms:
      tex_opcode = TGSI_OPCODE_TXF;

      if (c->has_txf_lz) {
         int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod);
         if (lod_src >= 0 &&
             nir_src_is_const(instr->src[lod_src].src) &&
             ntt_src_as_uint(c, instr->src[lod_src].src) == 0) {
            tex_opcode = TGSI_OPCODE_TXF_LZ;
         }
      }
      break;
   case nir_texop_txl:
      tex_opcode = TGSI_OPCODE_TXL;
      break;
   case nir_texop_txb:
      tex_opcode = TGSI_OPCODE_TXB;
      break;
   case nir_texop_txd:
      tex_opcode = TGSI_OPCODE_TXD;
      break;
   case nir_texop_txs:
      tex_opcode = TGSI_OPCODE_TXQ;
      break;
   case nir_texop_tg4:
      tex_opcode = TGSI_OPCODE_TG4;
      break;
   case nir_texop_query_levels:
      tex_opcode = TGSI_OPCODE_TXQ;
      break;
   case nir_texop_lod:
      tex_opcode = TGSI_OPCODE_LODQ;
      break;
   case nir_texop_texture_samples:
      tex_opcode = TGSI_OPCODE_TXQS;
      break;
   default:
      unreachable("unsupported tex op");
   }

   struct ntt_tex_operand_state s = { .i = 0 };
   ntt_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
   ntt_push_tex_arg(c, instr, nir_tex_src_backend2, &s);

   /* non-coord arg for TXQ */
   if (tex_opcode == TGSI_OPCODE_TXQ) {
      ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
      /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
       * scalar
       */
      s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
   }

   if (s.i > 1) {
      if (tex_opcode == TGSI_OPCODE_TEX)
         tex_opcode = TGSI_OPCODE_TEX2;
      if (tex_opcode == TGSI_OPCODE_TXB)
         tex_opcode = TGSI_OPCODE_TXB2;
      if (tex_opcode == TGSI_OPCODE_TXL)
         tex_opcode = TGSI_OPCODE_TXL2;
   }

   if (instr->op == nir_texop_txd) {
      /* Derivs appear in their own src args */
      int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
      int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
      s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
      s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
   }

   if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
      if (c->screen->get_param(c->screen,
                               PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
         sampler = ureg_scalar(sampler, instr->component);
         s.srcs[s.i++] = ureg_src_undef();
      } else {
         s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
      }
   }

   s.srcs[s.i++] = sampler;

   enum tgsi_return_type tex_type;
   switch (instr->dest_type) {
   case nir_type_float32:
      tex_type = TGSI_RETURN_TYPE_FLOAT;
      break;
   case nir_type_int32:
      tex_type = TGSI_RETURN_TYPE_SINT;
      break;
   case nir_type_uint32:
      tex_type = TGSI_RETURN_TYPE_UINT;
      break;
   default:
      unreachable("unknown texture type");
   }

   struct tgsi_texture_offset tex_offsets[4];
   unsigned num_tex_offsets = 0;
   int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
   if (tex_offset_src >= 0) {
      struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);

      tex_offsets[0].File = offset.File;
      tex_offsets[0].Index = offset.Index;
      tex_offsets[0].SwizzleX = offset.SwizzleX;
      tex_offsets[0].SwizzleY = offset.SwizzleY;
      tex_offsets[0].SwizzleZ = offset.SwizzleZ;
      tex_offsets[0].Padding = 0;

      num_tex_offsets = 1;
   }

   struct ureg_dst tex_dst;
   if (instr->op == nir_texop_query_levels)
      tex_dst = ureg_writemask(ureg_DECL_temporary(c->ureg), TGSI_WRITEMASK_W);
   else
      tex_dst = dst;

   ureg_tex_insn(c->ureg, tex_opcode,
                 &tex_dst, 1,
                 target,
                 tex_type,
                 tex_offsets, num_tex_offsets,
                 s.srcs, s.i);

   if (instr->op == nir_texop_query_levels) {
      ureg_MOV(c->ureg, dst, ureg_scalar(ureg_src(tex_dst), 3));
      ureg_release_temporary(c->ureg, tex_dst);
   }
}

static void
ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
{
   switch (jump->type) {
   case nir_jump_break:
      ureg_BRK(c->ureg);
      break;

   case nir_jump_continue:
      ureg_CONT(c->ureg);
      break;

   default:
      fprintf(stderr, "Unknown jump instruction: ");
      nir_print_instr(&jump->instr, stderr);
      fprintf(stderr, "\n");
      abort();
   }
}

static void
ntt_emit_ssa_undef(struct ntt_compile *c, nir_ssa_undef_instr *instr)
{
   /* Nothing to do but make sure that we have some storage to deref. */
   (void)ntt_get_ssa_def_decl(c, &instr->def);
}

static void
ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
{
   /* There is no addr reg in use before we start emitting an instr. */
   c->next_addr_reg = 0;

   switch (instr->type) {
   case nir_instr_type_deref:
      /* ignored, will be walked by nir_intrinsic_image_*_deref. */
      break;

   case nir_instr_type_alu:
      ntt_emit_alu(c, nir_instr_as_alu(instr));
      break;

   case nir_instr_type_intrinsic:
      ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
      break;

   case nir_instr_type_load_const:
      /* Nothing to do here, as load consts are done directly from
       * ntt_get_src() (since many constant NIR srcs will often get folded
       * directly into a register file index instead of as a TGSI src).
       */
      break;

   case nir_instr_type_tex:
      ntt_emit_texture(c, nir_instr_as_tex(instr));
      break;

   case nir_instr_type_jump:
      ntt_emit_jump(c, nir_instr_as_jump(instr));
      break;

   case nir_instr_type_ssa_undef:
      ntt_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
      break;

   default:
      fprintf(stderr, "Unknown NIR instr type: ");
      nir_print_instr(instr, stderr);
      fprintf(stderr, "\n");
      abort();
   }
}

static void
ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
{
   unsigned label;
   ureg_UIF(c->ureg, c->if_cond, &label);
   ntt_emit_cf_list(c, &if_stmt->then_list);

   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
      ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
      ureg_ELSE(c->ureg, &label);
      ntt_emit_cf_list(c, &if_stmt->else_list);
   }

   ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
   ureg_ENDIF(c->ureg);
}

static void
ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
{
   /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
    * does reference BGNLOOP's.  Follow the former behavior unless something comes up
    * with a need.
    */
   unsigned begin_label;
   ureg_BGNLOOP(c->ureg, &begin_label);
   ntt_emit_cf_list(c, &loop->body);

   unsigned end_label;
   ureg_ENDLOOP(c->ureg, &end_label);
}

static void
ntt_free_ssa_temp_by_index(struct ntt_compile *c, int index)
{
   /* We do store CONST/IMM/INPUT/etc. in ssa_temp[] */
   if (c->ssa_temp[index].File != TGSI_FILE_TEMPORARY)
      return;

   ureg_release_temporary(c->ureg, ureg_dst(c->ssa_temp[index]));
   memset(&c->ssa_temp[index], 0, sizeof(c->ssa_temp[index]));
}

/* Releases any temporaries for SSA defs with a live interval ending at this
 * instruction.
 */
static bool
ntt_src_live_interval_end_cb(nir_src *src, void *state)
{
   struct ntt_compile *c = state;

   if (src->is_ssa) {
      nir_ssa_def *def = src->ssa;

      if (c->liveness->defs[def->index].end == src->parent_instr->index)
         ntt_free_ssa_temp_by_index(c, def->index);
   }

   return true;
}

static void
ntt_emit_block(struct ntt_compile *c, nir_block *block)
{
   nir_foreach_instr(instr, block) {
      ntt_emit_instr(c, instr);

      nir_foreach_src(instr, ntt_src_live_interval_end_cb, c);
   }

   /* Set up the if condition for ntt_emit_if(), which we have to do before
    * freeing up the temps (the "if" is treated as inside the block for liveness
    * purposes, despite not being an instruction)
    *
    * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
    * looks at all of .xyzw.  No harm in working around the bug.
    */
   nir_if *nif = nir_block_get_following_if(block);
   if (nif)
      c->if_cond = ureg_scalar(ntt_get_src(c, nif->condition), TGSI_SWIZZLE_X);

   /* Free up any SSA temps that are unused at the end of the block. */
   unsigned index;
   BITSET_FOREACH_SET(index, block->live_out, BITSET_WORDS(c->impl->ssa_alloc)) {
      unsigned def_end_ip = c->liveness->defs[index].end;
      if (def_end_ip == block->end_ip)
         ntt_free_ssa_temp_by_index(c, index);
   }
}

static void
ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
{
   /* There is no addr reg in use before we start emitting any part of a CF
    * node (such as an if condition)
    */
   c->next_addr_reg = 0;

   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_block:
         ntt_emit_block(c, nir_cf_node_as_block(node));
         break;

      case nir_cf_node_if:
         ntt_emit_if(c, nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         ntt_emit_loop(c, nir_cf_node_as_loop(node));
         break;

      default:
         unreachable("unknown CF type");
      }
   }
}

static void
ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
{
   c->impl = impl;
   c->liveness = nir_live_ssa_defs_per_instr(impl);

   c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
   c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc);

   ntt_setup_registers(c, &impl->registers);
   ntt_emit_cf_list(c, &impl->body);

   ralloc_free(c->liveness);
   c->liveness = NULL;
}

static int
type_size(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}

/* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
 * can handle for 64-bit values in TGSI.
 */
static bool
ntt_should_vectorize_instr(const nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);

   switch (alu->op) {
   case nir_op_ibitfield_extract:
   case nir_op_ubitfield_extract:
   case nir_op_bitfield_insert:
      /* virglrenderer only looks at the .x channel of the offset/bits operands
       * when translating to GLSL.  tgsi.rst doesn't seem to require scalar
       * offset/bits operands.
       *
       * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195
       */
      return false;

   default:
      break;
   }

   unsigned num_components = alu->dest.dest.ssa.num_components;

   int src_bit_size = nir_src_bit_size(alu->src[0].src);
   int dst_bit_size = nir_dest_bit_size(alu->dest.dest);

   if (src_bit_size == 64 || dst_bit_size == 64) {
      if (num_components > 1)
         return false;
   }

   return true;
}

static bool
ntt_should_vectorize_io(unsigned align, unsigned bit_size,
                        unsigned num_components, unsigned high_offset,
                        nir_intrinsic_instr *low, nir_intrinsic_instr *high,
                        void *data)
{
   if (bit_size != 32)
      return false;

   /* Our offset alignment should aways be at least 4 bytes */
   if (align < 4)
      return false;

   /* No wrapping off the end of a TGSI reg.  We could do a bit better by
    * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
    * need this restriction.
    */
   unsigned worst_start_component = align == 4 ? 3 : align / 4;
   if (worst_start_component + num_components > 4)
      return false;

   return true;
}

static nir_variable_mode
ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
{
   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
   unsigned indirect_mask = 0;

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
      indirect_mask |= nir_var_shader_in;
   }

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
      indirect_mask |= nir_var_shader_out;
   }

   if (!screen->get_shader_param(screen, pipe_stage,
                                 PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
      indirect_mask |= nir_var_function_temp;
   }

   return indirect_mask;
}

static void
ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
{
   bool progress;
   unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
   unsigned control_flow_depth =
      screen->get_shader_param(screen, pipe_stage,
                               PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
   do {
      progress = false;

      NIR_PASS_V(s, nir_lower_vars_to_ssa);

      NIR_PASS(progress, s, nir_copy_prop);
      NIR_PASS(progress, s, nir_opt_algebraic);
      NIR_PASS(progress, s, nir_opt_constant_folding);
      NIR_PASS(progress, s, nir_opt_remove_phis);
      NIR_PASS(progress, s, nir_opt_conditional_discard);
      NIR_PASS(progress, s, nir_opt_dce);
      NIR_PASS(progress, s, nir_opt_dead_cf);
      NIR_PASS(progress, s, nir_opt_cse);
      NIR_PASS(progress, s, nir_opt_find_array_copies);
      NIR_PASS(progress, s, nir_opt_if, true);
      NIR_PASS(progress, s, nir_opt_peephole_select,
               control_flow_depth == 0 ? ~0 : 8, true, true);
      NIR_PASS(progress, s, nir_opt_algebraic);
      NIR_PASS(progress, s, nir_opt_constant_folding);
      nir_load_store_vectorize_options vectorize_opts = {
         .modes = nir_var_mem_ubo,
         .callback = ntt_should_vectorize_io,
         .robust_modes = 0,
      };
      NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
      NIR_PASS(progress, s, nir_opt_shrink_vectors, true);
      NIR_PASS(progress, s, nir_opt_trivial_continues);
      NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
      NIR_PASS(progress, s, nir_opt_undef);
      NIR_PASS(progress, s, nir_opt_loop_unroll);

   } while (progress);
}

/* Scalarizes all 64-bit ALU ops.  Note that we only actually need to
 * scalarize vec3/vec4s, should probably fix that.
 */
static bool
scalarize_64bit(const nir_instr *instr, const void *data)
{
   const nir_alu_instr *alu = nir_instr_as_alu(instr);

   return (nir_dest_bit_size(alu->dest.dest) == 64 ||
           nir_src_bit_size(alu->src[0].src) == 64);
}

static bool
nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
{
   b->cursor = nir_after_instr(&instr->instr);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_interpolated_input:
   case nir_intrinsic_load_per_vertex_input:
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_ssbo:
      break;
   default:
      return false;
   }

   if (instr->num_components <= 2)
      return false;

   bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
   if (has_dest) {
      if (nir_dest_bit_size(instr->dest) != 64)
         return false;
   } else  {
      if (nir_src_bit_size(instr->src[0]) != 64)
          return false;
   }

   nir_intrinsic_instr *first =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
   nir_intrinsic_instr *second =
      nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ubo_vec4:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_store_ssbo:
      break;

   default: {
      nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
      semantics.location++;
      semantics.num_slots--;
      nir_intrinsic_set_io_semantics(second, semantics);

      nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
      break;
   }
   }

   first->num_components = 2;
   second->num_components -= 2;
   if (has_dest) {
      first->dest.ssa.num_components = 2;
      second->dest.ssa.num_components -= 2;
   }

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   if (has_dest) {
      /* Merge the two loads' results back into a vector. */
      nir_ssa_def *channels[4] = {
         nir_channel(b, &first->dest.ssa, 0),
         nir_channel(b, &first->dest.ssa, 1),
         nir_channel(b, &second->dest.ssa, 0),
         second->num_components > 1 ? nir_channel(b, &second->dest.ssa, 1) : NULL,
      };
      nir_ssa_def *new = nir_vec(b, channels, instr->num_components);
      nir_ssa_def_rewrite_uses(&instr->dest.ssa, new);
   } else {
      /* Split the src value across the two stores. */
      b->cursor = nir_before_instr(&instr->instr);

      nir_ssa_def *src0 = instr->src[0].ssa;
      nir_ssa_def *channels[4] = { 0 };
      for (int i = 0; i < instr->num_components; i++)
         channels[i] = nir_channel(b, src0, i);

      nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
      nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);

      nir_instr_rewrite_src(&first->instr, &first->src[0],
                            nir_src_for_ssa(nir_vec(b, channels, 2)));
      nir_instr_rewrite_src(&second->instr, &second->src[0],
                            nir_src_for_ssa(nir_vec(b, &channels[2],
                                                    second->num_components)));
   }

   int offset_src = -1;
   uint32_t offset_amount = 16;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_ubo:
      offset_src = 1;
      break;
   case nir_intrinsic_load_ubo_vec4:
      offset_src = 1;
      offset_amount = 1;
      break;
   case nir_intrinsic_store_ssbo:
      offset_src = 2;
      break;
   default:
      break;
   }
   if (offset_src != -1) {
      b->cursor = nir_before_instr(&second->instr);
      nir_ssa_def *second_offset =
         nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
      nir_instr_rewrite_src(&second->instr, &second->src[offset_src],
                            nir_src_for_ssa(second_offset));
   }

   /* DCE stores we generated with no writemask (nothing else does this
    * currently).
    */
   if (!has_dest) {
      if (nir_intrinsic_write_mask(first) == 0)
         nir_instr_remove(&first->instr);
      if (nir_intrinsic_write_mask(second) == 0)
         nir_instr_remove(&second->instr);
   }

   nir_instr_remove(&instr->instr);

   return true;
}

static bool
nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   if (instr->def.bit_size != 64 || num_components <= 2)
      return false;

   b->cursor = nir_before_instr(&instr->instr);

   nir_load_const_instr *first =
      nir_load_const_instr_create(b->shader, 2, 64);
   nir_load_const_instr *second =
      nir_load_const_instr_create(b->shader, num_components - 2, 64);

   first->value[0] = instr->value[0];
   first->value[1] = instr->value[1];
   second->value[0] = instr->value[2];
   if (num_components == 4)
      second->value[1] = instr->value[3];

   nir_builder_instr_insert(b, &first->instr);
   nir_builder_instr_insert(b, &second->instr);

   nir_ssa_def *channels[4] = {
      nir_channel(b, &first->def, 0),
      nir_channel(b, &first->def, 1),
      nir_channel(b, &second->def, 0),
      num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
   };
   nir_ssa_def *new = nir_vec(b, channels, num_components);
   nir_ssa_def_rewrite_uses(&instr->def, new);
   nir_instr_remove(&instr->instr);

   return true;
}

static bool
nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
                                      void *data)
{
   switch (instr->type) {
   case nir_instr_type_load_const:
      return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));

   case nir_instr_type_intrinsic:
      return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
   default:
      return false;
   }
}

static bool
nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
{
   return nir_shader_instructions_pass(s,
                                       nir_to_tgsi_lower_64bit_to_vec2_instr,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance,
                                       NULL);
}

struct ntt_lower_tex_state {
   nir_ssa_def *channels[8];
   unsigned i;
};

static void
nir_to_tgsi_lower_tex_instr_arg(nir_builder *b,
                                nir_tex_instr *instr,
                                nir_tex_src_type tex_src_type,
                                struct ntt_lower_tex_state *s)
{
   int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
   if (tex_src < 0)
      return;

   assert(instr->src[tex_src].src.is_ssa);

   nir_ssa_def *def = instr->src[tex_src].src.ssa;
   for (int i = 0; i < def->num_components; i++) {
      s->channels[s->i++] = nir_channel(b, def, i);
   }

   nir_tex_instr_remove_src(instr, tex_src);
}

/**
 * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
 * src.  This lets NIR handle the coalescing of the vec4 rather than trying to
 * manage it on our own, and may lead to more vectorization.
 */
static bool
nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);

   if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
      return false;

   /* NIR after lower_tex will have LOD set to 0 for tex ops that wanted
    * implicit lod in shader stages that don't have quad-based derivatives.
    * TGSI doesn't want that, it requires that the backend do implict LOD 0 for
    * those stages.
    */
   if (!nir_shader_supports_implicit_lod(b->shader) && tex->op == nir_texop_txl) {
      int lod_index = nir_tex_instr_src_index(tex, nir_tex_src_lod);
      nir_src *lod_src = &tex->src[lod_index].src;
      if (nir_src_is_const(*lod_src) && nir_src_as_uint(*lod_src) == 0) {
         nir_tex_instr_remove_src(tex, lod_index);
         tex->op = nir_texop_tex;
      }
   }

   b->cursor = nir_before_instr(instr);

   struct ntt_lower_tex_state s = {0};

   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
   /* We always have at least two slots for the coordinate, even on 1D. */
   s.i = MAX2(s.i, 2);

   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
   s.i = MAX2(s.i, 3);

   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);

   /* XXX: LZ */
   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
   nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);

   /* No need to pack undefs in unused channels of the tex instr */
   while (!s.channels[s.i - 1])
      s.i--;

   /* Instead of putting undefs in the unused slots of the vecs, just put in
    * another used channel.  Otherwise, we'll get unnecessary moves into
    * registers.
    */
   assert(s.channels[0] != NULL);
   for (int i = 1; i < s.i; i++) {
      if (!s.channels[i])
         s.channels[i] = s.channels[0];
   }

   nir_tex_instr_add_src(tex, nir_tex_src_backend1, nir_src_for_ssa(nir_vec(b, s.channels, MIN2(s.i, 4))));
   if (s.i > 4)
      nir_tex_instr_add_src(tex, nir_tex_src_backend2, nir_src_for_ssa(nir_vec(b, &s.channels[4], s.i - 4)));

   return true;
}

static bool
nir_to_tgsi_lower_tex(nir_shader *s)
{
   return nir_shader_instructions_pass(s,
                                       nir_to_tgsi_lower_tex_instr,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance,
                                       NULL);
}

static void
ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s)
{
   const struct nir_shader_compiler_options *options = s->options;
   bool lower_fsqrt =
      !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage),
                                PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);

   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);

   if (!options->lower_extract_byte ||
       !options->lower_extract_word ||
       !options->lower_insert_byte ||
       !options->lower_insert_word ||
       !options->lower_fdph ||
       !options->lower_flrp64 ||
       !options->lower_fmod ||
       !options->lower_rotate ||
       !options->lower_uniforms_to_ubo ||
       !options->lower_vector_cmp ||
       options->lower_fsqrt != lower_fsqrt ||
       options->force_indirect_unrolling != no_indirects_mask) {
      nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options);
      *new_options = *s->options;

      new_options->lower_extract_byte = true;
      new_options->lower_extract_word = true;
      new_options->lower_insert_byte = true;
      new_options->lower_insert_word = true;
      new_options->lower_fdph = true;
      new_options->lower_flrp64 = true;
      new_options->lower_fmod = true;
      new_options->lower_rotate = true;
      new_options->lower_uniforms_to_ubo = true,
      new_options->lower_vector_cmp = true;
      new_options->lower_fsqrt = lower_fsqrt;
      new_options->force_indirect_unrolling = no_indirects_mask;

      s->options = new_options;
   }
}

static bool
ntt_lower_atomic_pre_dec_filter(const nir_instr *instr, const void *_data)
{
   return (instr->type == nir_instr_type_intrinsic &&
           nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_atomic_counter_pre_dec);
}

static nir_ssa_def *
ntt_lower_atomic_pre_dec_lower(nir_builder *b, nir_instr *instr, void *_data)
{
   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

   nir_ssa_def *old_result = &intr->dest.ssa;
   intr->intrinsic = nir_intrinsic_atomic_counter_post_dec;

   return nir_iadd_imm(b, old_result, -1);
}

static bool
ntt_lower_atomic_pre_dec(nir_shader *s)
{
   return nir_shader_lower_instructions(s,
                                        ntt_lower_atomic_pre_dec_filter,
                                        ntt_lower_atomic_pre_dec_lower, NULL);
}

/* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
static void
nir_to_tgsi_lower_txp(nir_shader *s)
{
   nir_lower_tex_options lower_tex_options = {
       .lower_txp = 0,
   };

   nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
      nir_foreach_instr(instr, block) {
         if (instr->type != nir_instr_type_tex)
            continue;
         nir_tex_instr *tex = nir_instr_as_tex(instr);

         if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
            continue;

         bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
         bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT;
         bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;

         /* We can do TXP for any tex (not txg) where we can fit all the
          * coordinates and comparator and projector in one vec4 without any
          * other modifiers to add on.
          *
          * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
          * if we get any funny projectors then we just blow them all away.
          */
         if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
            lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
      }
   }

   /* nir_lower_tex must be run even if no options are set, because we need the
    * LOD to be set for query_levels and for non-fragment shaders.
    */
   NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
}

static bool
nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data)
{
   return (instr->type == nir_instr_type_intrinsic &&
           nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id);
}

static nir_ssa_def *
nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data)
{
   nir_variable *var = *(nir_variable **)data;
   if (!var) {
      var = nir_variable_create(b->shader, nir_var_shader_in, glsl_uint_type(), "gl_PrimitiveID");
      var->data.location = VARYING_SLOT_PRIMITIVE_ID;
      b->shader->info.inputs_read |= VARYING_BIT_PRIMITIVE_ID;
      var->data.driver_location = b->shader->num_outputs++;

      *(nir_variable **)data = var;
   }

   nir_io_semantics semantics = {
      .location = var->data.location,
       .num_slots = 1
   };
   return nir_load_input(b, 1, 32, nir_imm_int(b, 0),
                         .base = var->data.driver_location,
                         .io_semantics = semantics);
}

static bool
nir_lower_primid_sysval_to_input(nir_shader *s)
{
   nir_variable *input = NULL;

   return nir_shader_lower_instructions(s,
                                        nir_lower_primid_sysval_to_input_filter,
                                        nir_lower_primid_sysval_to_input_lower, &input);
}

/**
 * Translates the NIR shader to TGSI.
 *
 * This requires some lowering of the NIR shader to prepare it for translation.
 * We take ownership of the NIR shader passed, returning a reference to the new
 * TGSI tokens instead.  If you need to keep the NIR, then pass us a clone.
 */
const void *
nir_to_tgsi(struct nir_shader *s,
            struct pipe_screen *screen)
{
   struct ntt_compile *c;
   const void *tgsi_tokens;
   bool debug = env_var_as_boolean("NIR_TO_TGSI_DEBUG", false);
   nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
   bool native_integers = screen->get_shader_param(screen,
                                                   pipe_shader_type_from_mesa(s->info.stage),
                                                   PIPE_SHADER_CAP_INTEGERS);
   const struct nir_shader_compiler_options *original_options = s->options;

   ntt_fix_nir_options(screen, s);

   NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
              type_size, (nir_lower_io_options)0);
   NIR_PASS_V(s, nir_lower_regs_to_ssa);

   nir_to_tgsi_lower_txp(s);
   NIR_PASS_V(s, nir_to_tgsi_lower_tex);

   /* While TGSI can represent PRIMID as either an input or a system value,
    * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers
    * depend on that.
    */
   if (s->info.stage == MESA_SHADER_GEOMETRY)
      NIR_PASS_V(s, nir_lower_primid_sysval_to_input);

   if (s->info.num_abos)
      NIR_PASS_V(s, ntt_lower_atomic_pre_dec);

   if (!original_options->lower_uniforms_to_ubo) {
      NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
                 screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
                 !native_integers);
   }

   /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
    * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
    * duplication logic we just make it so that we only see vec2s.
    */
   NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
   NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);

   if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
      NIR_PASS_V(s, nir_lower_ubo_vec4);

   ntt_optimize_nir(s, screen);

   NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);

   bool progress;
   do {
      progress = false;
      NIR_PASS(progress, s, nir_opt_algebraic_late);
      if (progress) {
         NIR_PASS_V(s, nir_copy_prop);
         NIR_PASS_V(s, nir_opt_dce);
         NIR_PASS_V(s, nir_opt_cse);
      }
   } while (progress);

   if (screen->get_shader_param(screen,
                                pipe_shader_type_from_mesa(s->info.stage),
                                PIPE_SHADER_CAP_INTEGERS)) {
      NIR_PASS_V(s, nir_lower_bool_to_int32);
   } else {
      NIR_PASS_V(s, nir_lower_int_to_float);
      NIR_PASS_V(s, nir_lower_bool_to_float);
      /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
      NIR_PASS_V(s, nir_copy_prop);
      NIR_PASS_V(s, nir_opt_dce);
   }

   /* Only lower 32-bit floats.  The only other modifier type officially
    * supported by TGSI is 32-bit integer negates, but even those are broken on
    * virglrenderer, so skip lowering all integer and f64 float mods.
    */
   NIR_PASS_V(s, nir_lower_to_source_mods, nir_lower_float_source_mods);
   NIR_PASS_V(s, nir_convert_from_ssa, true);
   NIR_PASS_V(s, nir_lower_vec_to_movs, NULL, NULL);

   /* locals_to_regs will leave dead derefs that are good to clean up. */
   NIR_PASS_V(s, nir_lower_locals_to_regs);
   NIR_PASS_V(s, nir_opt_dce);

   if (debug) {
      fprintf(stderr, "NIR before translation to TGSI:\n");
      nir_print_shader(s, stderr);
   }

   c = rzalloc(NULL, struct ntt_compile);
   c->screen = screen;

   c->needs_texcoord_semantic =
      screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
   c->any_reg_as_address =
      screen->get_param(screen, PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS);
   c->has_txf_lz =
      screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);

   c->s = s;
   c->native_integers = native_integers;
   c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
   ureg_setup_shader_info(c->ureg, &s->info);

   ntt_setup_inputs(c);
   ntt_setup_outputs(c);
   ntt_setup_uniforms(c);

   if (s->info.stage == MESA_SHADER_FRAGMENT) {
      /* The draw module's polygon stipple layer doesn't respect the chosen
       * coordinate mode, so leave it as unspecified unless we're actually
       * reading the position in the shader already.  See
       * gl-2.1-polygon-stipple-fs on softpipe.
       */
      if ((s->info.inputs_read & VARYING_BIT_POS) ||
          BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
                       s->info.fs.origin_upper_left ?
                       TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
                       TGSI_FS_COORD_ORIGIN_LOWER_LEFT);

         ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
                       s->info.fs.pixel_center_integer ?
                       TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
                       TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
      }
   }
   /* Emit the main function */
   nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
   ntt_emit_impl(c, impl);
   ureg_END(c->ureg);

   tgsi_tokens = ureg_get_tokens(c->ureg, NULL);

   if (debug) {
      fprintf(stderr, "TGSI after translation from NIR:\n");
      tgsi_dump(tgsi_tokens, 0);
   }

   ureg_destroy(c->ureg);

   ralloc_free(c);
   ralloc_free(s);

   return tgsi_tokens;
}

static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_fdph = true,
   .lower_flrp64 = true,
   .lower_fmod = true,
   .lower_rotate = true,
   .lower_uniforms_to_ubo = true,
   .lower_vector_cmp = true,
   .use_interpolated_input_intrinsics = true,
};

/* Returns a default compiler options for drivers with only nir-to-tgsi-based
 * NIR support.
 */
const void *
nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
                                 enum pipe_shader_ir ir,
                                 unsigned shader)
{
   assert(ir == PIPE_SHADER_IR_NIR);
   return &nir_to_tgsi_compiler_options;
}