#include "blas_extended.h"
#include "blas_extended_private.h"
void BLAS_dgemm_x(enum blas_order_type order, enum blas_trans_type transa,
		  enum blas_trans_type transb, int m, int n, int k,
		  double alpha, const double *a, int lda, const double *b,
		  int ldb, double beta, double *c, int ldc,
		  enum blas_prec_type prec)

/* 
 * Purpose
 * =======
 *
 * This routine computes the matrix product:
 *
 *      C   <-  alpha * op(A) * op(B)  +  beta * C .
 * 
 * where op(M) represents either M, M transpose, 
 * or M conjugate transpose.
 *
 * Arguments
 * =========
 *
 * order   (input) enum blas_order_type
 *         Storage format of input matrices A, B, and C.
 *
 * transa  (input) enum blas_trans_type
 *         Operation to be done on matrix A before multiplication.
 *         Can be no operation, transposition, or conjugate transposition.
 *
 * transb  (input) enum blas_trans_type
 *         Operation to be done on matrix B before multiplication.
 *         Can be no operation, transposition, or conjugate transposition.
 * 
 * m n k   (input) int
 *         The dimensions of matrices A, B, and C.
 *         Matrix C is m-by-n matrix.
 *         Matrix A is m-by-k if A is not transposed, 
 *                     k-by-m otherwise.
 *         Matrix B is k-by-n if B is not transposed, 
 *                     n-by-k otherwise.
 *      
 * alpha   (input) double
 *
 * a       (input) const double*
 *         matrix A.
 * 
 * lda     (input) int
 *         leading dimension of A.
 * 
 * b       (input) const double*
 *         matrix B
 *
 * ldb     (input) int
 *         leading dimension of B.
 *
 * beta    (input) double
 *
 * c       (input/output) double*
 *         matrix C
 *
 * ldc     (input) int
 *         leading dimension of C.
 *
 * prec   (input) enum blas_prec_type
 *        Specifies the internal precision to be used.
 *        = blas_prec_single: single precision.
 *        = blas_prec_double: double precision.
 *        = blas_prec_extra : anything at least 1.5 times as accurate
 *                            than double, and wider than 80-bits.
 *                            We use double-double in our implementation.
 *
 */
{
  static const char routine_name[] = "BLAS_dgemm_x";
  switch (prec) {

  case blas_prec_single:
  case blas_prec_double:
  case blas_prec_indigenous:{


      /* Integer Index Variables */
      int i, j, h;

      int ai, bj, ci;
      int aih, bhj, cij;	/* Index into matrices a, b, c during multiply */

      int incai, incaih;	/* Index increments for matrix a */
      int incbj, incbhj;	/* Index increments for matrix b */
      int incci, inccij;	/* Index increments for matrix c */

      /* Input Matrices */
      const double *a_i = a;
      const double *b_i = b;

      /* Output Matrix */
      double *c_i = c;

      /* Input Scalars */
      double alpha_i = alpha;
      double beta_i = beta;

      /* Temporary Floating-Point Variables */
      double a_elem;
      double b_elem;
      double c_elem;
      double prod;
      double sum;
      double tmp1;
      double tmp2;



      /* Test for error conditions */
      if (m < 0)
	BLAS_error(routine_name, -4, m, NULL);
      if (n < 0)
	BLAS_error(routine_name, -5, n, NULL);
      if (k < 0)
	BLAS_error(routine_name, -6, k, NULL);

      if (order == blas_colmajor) {

	if (ldc < m)
	  BLAS_error(routine_name, -14, ldc, NULL);

	if (transa == blas_no_trans) {
	  if (lda < m)
	    BLAS_error(routine_name, -9, lda, NULL);
	} else {
	  if (lda < k)
	    BLAS_error(routine_name, -9, lda, NULL);
	}

	if (transb == blas_no_trans) {
	  if (ldb < k)
	    BLAS_error(routine_name, -11, ldb, NULL);
	} else {
	  if (ldb < n)
	    BLAS_error(routine_name, -11, ldb, NULL);
	}

      } else {
	/* row major */
	if (ldc < n)
	  BLAS_error(routine_name, -14, ldc, NULL);

	if (transa == blas_no_trans) {
	  if (lda < k)
	    BLAS_error(routine_name, -9, lda, NULL);
	} else {
	  if (lda < m)
	    BLAS_error(routine_name, -9, lda, NULL);
	}

	if (transb == blas_no_trans) {
	  if (ldb < n)
	    BLAS_error(routine_name, -11, ldb, NULL);
	} else {
	  if (ldb < k)
	    BLAS_error(routine_name, -11, ldb, NULL);
	}
      }

      /* Test for no-op */
      if (n == 0 || m == 0 || k == 0)
	return;
      if (alpha_i == 0.0 && beta_i == 1.0) {
	return;
      }

      /* Set Index Parameters */
      if (order == blas_colmajor) {
	incci = 1;
	inccij = ldc;

	if (transa == blas_no_trans) {
	  incai = 1;
	  incaih = lda;
	} else {
	  incai = lda;
	  incaih = 1;
	}

	if (transb == blas_no_trans) {
	  incbj = ldb;
	  incbhj = 1;
	} else {
	  incbj = 1;
	  incbhj = ldb;
	}

      } else {
	/* row major */
	incci = ldc;
	inccij = 1;

	if (transa == blas_no_trans) {
	  incai = lda;
	  incaih = 1;
	} else {
	  incai = 1;
	  incaih = lda;
	}

	if (transb == blas_no_trans) {
	  incbj = 1;
	  incbhj = ldb;
	} else {
	  incbj = ldb;
	  incbhj = 1;
	}

      }



      /* Ajustment to increments */







      /* alpha = 0.  In this case, just return beta * C */
      if (alpha_i == 0.0) {

	ci = 0;
	for (i = 0; i < m; i++, ci += incci) {
	  cij = ci;
	  for (j = 0; j < n; j++, cij += inccij) {
	    c_elem = c_i[cij];
	    tmp1 = c_elem * beta_i;
	    c_i[cij] = tmp1;
	  }
	}

      } else if (alpha_i == 1.0) {

	/* Case alpha == 1. */

	if (beta_i == 0.0) {
	  /* Case alpha == 1, beta == 0.   We compute  C <--- A * B */

	  ci = 0;
	  ai = 0;
	  for (i = 0; i < m; i++, ci += incci, ai += incai) {

	    cij = ci;
	    bj = 0;

	    for (j = 0; j < n; j++, cij += inccij, bj += incbj) {

	      aih = ai;
	      bhj = bj;

	      sum = 0.0;

	      for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
		a_elem = a_i[aih];
		b_elem = b_i[bhj];
		if (transa == blas_conj_trans) {

		}
		if (transb == blas_conj_trans) {

		}
		prod = a_elem * b_elem;
		sum = sum + prod;
	      }
	      c_i[cij] = sum;
	    }
	  }

	} else {
	  /* Case alpha == 1, but beta != 0.
	     We compute   C <--- A * B + beta * C   */

	  ci = 0;
	  ai = 0;
	  for (i = 0; i < m; i++, ci += incci, ai += incai) {

	    cij = ci;
	    bj = 0;

	    for (j = 0; j < n; j++, cij += inccij, bj += incbj) {

	      aih = ai;
	      bhj = bj;

	      sum = 0.0;

	      for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
		a_elem = a_i[aih];
		b_elem = b_i[bhj];
		if (transa == blas_conj_trans) {

		}
		if (transb == blas_conj_trans) {

		}
		prod = a_elem * b_elem;
		sum = sum + prod;
	      }

	      c_elem = c_i[cij];
	      tmp2 = c_elem * beta_i;
	      tmp1 = sum;
	      tmp1 = tmp2 + tmp1;
	      c_i[cij] = tmp1;
	    }
	  }
	}

      } else {

	/* The most general form,   C <-- alpha * A * B + beta * C  */
	ci = 0;
	ai = 0;
	for (i = 0; i < m; i++, ci += incci, ai += incai) {

	  cij = ci;
	  bj = 0;

	  for (j = 0; j < n; j++, cij += inccij, bj += incbj) {

	    aih = ai;
	    bhj = bj;

	    sum = 0.0;

	    for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
	      a_elem = a_i[aih];
	      b_elem = b_i[bhj];
	      if (transa == blas_conj_trans) {

	      }
	      if (transb == blas_conj_trans) {

	      }
	      prod = a_elem * b_elem;
	      sum = sum + prod;
	    }

	    tmp1 = sum * alpha_i;
	    c_elem = c_i[cij];
	    tmp2 = c_elem * beta_i;
	    tmp1 = tmp1 + tmp2;
	    c_i[cij] = tmp1;
	  }
	}

      }



      break;
    }

  case blas_prec_extra:{


      /* Integer Index Variables */
      int i, j, h;

      int ai, bj, ci;
      int aih, bhj, cij;	/* Index into matrices a, b, c during multiply */

      int incai, incaih;	/* Index increments for matrix a */
      int incbj, incbhj;	/* Index increments for matrix b */
      int incci, inccij;	/* Index increments for matrix c */

      /* Input Matrices */
      const double *a_i = a;
      const double *b_i = b;

      /* Output Matrix */
      double *c_i = c;

      /* Input Scalars */
      double alpha_i = alpha;
      double beta_i = beta;

      /* Temporary Floating-Point Variables */
      double a_elem;
      double b_elem;
      double c_elem;
      double head_prod, tail_prod;
      double head_sum, tail_sum;
      double head_tmp1, tail_tmp1;
      double head_tmp2, tail_tmp2;

      FPU_FIX_DECL;

      /* Test for error conditions */
      if (m < 0)
	BLAS_error(routine_name, -4, m, NULL);
      if (n < 0)
	BLAS_error(routine_name, -5, n, NULL);
      if (k < 0)
	BLAS_error(routine_name, -6, k, NULL);

      if (order == blas_colmajor) {

	if (ldc < m)
	  BLAS_error(routine_name, -14, ldc, NULL);

	if (transa == blas_no_trans) {
	  if (lda < m)
	    BLAS_error(routine_name, -9, lda, NULL);
	} else {
	  if (lda < k)
	    BLAS_error(routine_name, -9, lda, NULL);
	}

	if (transb == blas_no_trans) {
	  if (ldb < k)
	    BLAS_error(routine_name, -11, ldb, NULL);
	} else {
	  if (ldb < n)
	    BLAS_error(routine_name, -11, ldb, NULL);
	}

      } else {
	/* row major */
	if (ldc < n)
	  BLAS_error(routine_name, -14, ldc, NULL);

	if (transa == blas_no_trans) {
	  if (lda < k)
	    BLAS_error(routine_name, -9, lda, NULL);
	} else {
	  if (lda < m)
	    BLAS_error(routine_name, -9, lda, NULL);
	}

	if (transb == blas_no_trans) {
	  if (ldb < n)
	    BLAS_error(routine_name, -11, ldb, NULL);
	} else {
	  if (ldb < k)
	    BLAS_error(routine_name, -11, ldb, NULL);
	}
      }

      /* Test for no-op */
      if (n == 0 || m == 0 || k == 0)
	return;
      if (alpha_i == 0.0 && beta_i == 1.0) {
	return;
      }

      /* Set Index Parameters */
      if (order == blas_colmajor) {
	incci = 1;
	inccij = ldc;

	if (transa == blas_no_trans) {
	  incai = 1;
	  incaih = lda;
	} else {
	  incai = lda;
	  incaih = 1;
	}

	if (transb == blas_no_trans) {
	  incbj = ldb;
	  incbhj = 1;
	} else {
	  incbj = 1;
	  incbhj = ldb;
	}

      } else {
	/* row major */
	incci = ldc;
	inccij = 1;

	if (transa == blas_no_trans) {
	  incai = lda;
	  incaih = 1;
	} else {
	  incai = 1;
	  incaih = lda;
	}

	if (transb == blas_no_trans) {
	  incbj = 1;
	  incbhj = ldb;
	} else {
	  incbj = ldb;
	  incbhj = 1;
	}

      }

      FPU_FIX_START;

      /* Ajustment to increments */







      /* alpha = 0.  In this case, just return beta * C */
      if (alpha_i == 0.0) {

	ci = 0;
	for (i = 0; i < m; i++, ci += incci) {
	  cij = ci;
	  for (j = 0; j < n; j++, cij += inccij) {
	    c_elem = c_i[cij];
	    {
	      /* Compute double_double = double * double. */
	      double a1, a2, b1, b2, con;

	      con = c_elem * split;
	      a1 = con - c_elem;
	      a1 = con - a1;
	      a2 = c_elem - a1;
	      con = beta_i * split;
	      b1 = con - beta_i;
	      b1 = con - b1;
	      b2 = beta_i - b1;

	      head_tmp1 = c_elem * beta_i;
	      tail_tmp1 =
		(((a1 * b1 - head_tmp1) + a1 * b2) + a2 * b1) + a2 * b2;
	    }
	    c_i[cij] = head_tmp1;
	  }
	}

      } else if (alpha_i == 1.0) {

	/* Case alpha == 1. */

	if (beta_i == 0.0) {
	  /* Case alpha == 1, beta == 0.   We compute  C <--- A * B */

	  ci = 0;
	  ai = 0;
	  for (i = 0; i < m; i++, ci += incci, ai += incai) {

	    cij = ci;
	    bj = 0;

	    for (j = 0; j < n; j++, cij += inccij, bj += incbj) {

	      aih = ai;
	      bhj = bj;

	      head_sum = tail_sum = 0.0;

	      for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
		a_elem = a_i[aih];
		b_elem = b_i[bhj];
		if (transa == blas_conj_trans) {

		}
		if (transb == blas_conj_trans) {

		}
		{
		  /* Compute double_double = double * double. */
		  double a1, a2, b1, b2, con;

		  con = a_elem * split;
		  a1 = con - a_elem;
		  a1 = con - a1;
		  a2 = a_elem - a1;
		  con = b_elem * split;
		  b1 = con - b_elem;
		  b1 = con - b1;
		  b2 = b_elem - b1;

		  head_prod = a_elem * b_elem;
		  tail_prod =
		    (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
		}
		{
		  /* Compute double-double = double-double + double-double. */
		  double bv;
		  double s1, s2, t1, t2;

		  /* Add two hi words. */
		  s1 = head_sum + head_prod;
		  bv = s1 - head_sum;
		  s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));

		  /* Add two lo words. */
		  t1 = tail_sum + tail_prod;
		  bv = t1 - tail_sum;
		  t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));

		  s2 += t1;

		  /* Renormalize (s1, s2)  to  (t1, s2) */
		  t1 = s1 + s2;
		  s2 = s2 - (t1 - s1);

		  t2 += s2;

		  /* Renormalize (t1, t2)  */
		  head_sum = t1 + t2;
		  tail_sum = t2 - (head_sum - t1);
		}
	      }
	      c_i[cij] = head_sum;
	    }
	  }

	} else {
	  /* Case alpha == 1, but beta != 0.
	     We compute   C <--- A * B + beta * C   */

	  ci = 0;
	  ai = 0;
	  for (i = 0; i < m; i++, ci += incci, ai += incai) {

	    cij = ci;
	    bj = 0;

	    for (j = 0; j < n; j++, cij += inccij, bj += incbj) {

	      aih = ai;
	      bhj = bj;

	      head_sum = tail_sum = 0.0;

	      for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
		a_elem = a_i[aih];
		b_elem = b_i[bhj];
		if (transa == blas_conj_trans) {

		}
		if (transb == blas_conj_trans) {

		}
		{
		  /* Compute double_double = double * double. */
		  double a1, a2, b1, b2, con;

		  con = a_elem * split;
		  a1 = con - a_elem;
		  a1 = con - a1;
		  a2 = a_elem - a1;
		  con = b_elem * split;
		  b1 = con - b_elem;
		  b1 = con - b1;
		  b2 = b_elem - b1;

		  head_prod = a_elem * b_elem;
		  tail_prod =
		    (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
		}
		{
		  /* Compute double-double = double-double + double-double. */
		  double bv;
		  double s1, s2, t1, t2;

		  /* Add two hi words. */
		  s1 = head_sum + head_prod;
		  bv = s1 - head_sum;
		  s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));

		  /* Add two lo words. */
		  t1 = tail_sum + tail_prod;
		  bv = t1 - tail_sum;
		  t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));

		  s2 += t1;

		  /* Renormalize (s1, s2)  to  (t1, s2) */
		  t1 = s1 + s2;
		  s2 = s2 - (t1 - s1);

		  t2 += s2;

		  /* Renormalize (t1, t2)  */
		  head_sum = t1 + t2;
		  tail_sum = t2 - (head_sum - t1);
		}
	      }

	      c_elem = c_i[cij];
	      {
		/* Compute double_double = double * double. */
		double a1, a2, b1, b2, con;

		con = c_elem * split;
		a1 = con - c_elem;
		a1 = con - a1;
		a2 = c_elem - a1;
		con = beta_i * split;
		b1 = con - beta_i;
		b1 = con - b1;
		b2 = beta_i - b1;

		head_tmp2 = c_elem * beta_i;
		tail_tmp2 =
		  (((a1 * b1 - head_tmp2) + a1 * b2) + a2 * b1) + a2 * b2;
	      }
	      head_tmp1 = head_sum;
	      tail_tmp1 = tail_sum;
	      {
		/* Compute double-double = double-double + double-double. */
		double bv;
		double s1, s2, t1, t2;

		/* Add two hi words. */
		s1 = head_tmp2 + head_tmp1;
		bv = s1 - head_tmp2;
		s2 = ((head_tmp1 - bv) + (head_tmp2 - (s1 - bv)));

		/* Add two lo words. */
		t1 = tail_tmp2 + tail_tmp1;
		bv = t1 - tail_tmp2;
		t2 = ((tail_tmp1 - bv) + (tail_tmp2 - (t1 - bv)));

		s2 += t1;

		/* Renormalize (s1, s2)  to  (t1, s2) */
		t1 = s1 + s2;
		s2 = s2 - (t1 - s1);

		t2 += s2;

		/* Renormalize (t1, t2)  */
		head_tmp1 = t1 + t2;
		tail_tmp1 = t2 - (head_tmp1 - t1);
	      }
	      c_i[cij] = head_tmp1;
	    }
	  }
	}

      } else {

	/* The most general form,   C <-- alpha * A * B + beta * C  */
	ci = 0;
	ai = 0;
	for (i = 0; i < m; i++, ci += incci, ai += incai) {

	  cij = ci;
	  bj = 0;

	  for (j = 0; j < n; j++, cij += inccij, bj += incbj) {

	    aih = ai;
	    bhj = bj;

	    head_sum = tail_sum = 0.0;

	    for (h = 0; h < k; h++, aih += incaih, bhj += incbhj) {
	      a_elem = a_i[aih];
	      b_elem = b_i[bhj];
	      if (transa == blas_conj_trans) {

	      }
	      if (transb == blas_conj_trans) {

	      }
	      {
		/* Compute double_double = double * double. */
		double a1, a2, b1, b2, con;

		con = a_elem * split;
		a1 = con - a_elem;
		a1 = con - a1;
		a2 = a_elem - a1;
		con = b_elem * split;
		b1 = con - b_elem;
		b1 = con - b1;
		b2 = b_elem - b1;

		head_prod = a_elem * b_elem;
		tail_prod =
		  (((a1 * b1 - head_prod) + a1 * b2) + a2 * b1) + a2 * b2;
	      }
	      {
		/* Compute double-double = double-double + double-double. */
		double bv;
		double s1, s2, t1, t2;

		/* Add two hi words. */
		s1 = head_sum + head_prod;
		bv = s1 - head_sum;
		s2 = ((head_prod - bv) + (head_sum - (s1 - bv)));

		/* Add two lo words. */
		t1 = tail_sum + tail_prod;
		bv = t1 - tail_sum;
		t2 = ((tail_prod - bv) + (tail_sum - (t1 - bv)));

		s2 += t1;

		/* Renormalize (s1, s2)  to  (t1, s2) */
		t1 = s1 + s2;
		s2 = s2 - (t1 - s1);

		t2 += s2;

		/* Renormalize (t1, t2)  */
		head_sum = t1 + t2;
		tail_sum = t2 - (head_sum - t1);
	      }
	    }

	    {
	      /* Compute double-double = double-double * double. */
	      double a11, a21, b1, b2, c11, c21, c2, con, t1, t2;

	      con = head_sum * split;
	      a11 = con - head_sum;
	      a11 = con - a11;
	      a21 = head_sum - a11;
	      con = alpha_i * split;
	      b1 = con - alpha_i;
	      b1 = con - b1;
	      b2 = alpha_i - b1;

	      c11 = head_sum * alpha_i;
	      c21 = (((a11 * b1 - c11) + a11 * b2) + a21 * b1) + a21 * b2;

	      c2 = tail_sum * alpha_i;
	      t1 = c11 + c2;
	      t2 = (c2 - (t1 - c11)) + c21;

	      head_tmp1 = t1 + t2;
	      tail_tmp1 = t2 - (head_tmp1 - t1);
	    }
	    c_elem = c_i[cij];
	    {
	      /* Compute double_double = double * double. */
	      double a1, a2, b1, b2, con;

	      con = c_elem * split;
	      a1 = con - c_elem;
	      a1 = con - a1;
	      a2 = c_elem - a1;
	      con = beta_i * split;
	      b1 = con - beta_i;
	      b1 = con - b1;
	      b2 = beta_i - b1;

	      head_tmp2 = c_elem * beta_i;
	      tail_tmp2 =
		(((a1 * b1 - head_tmp2) + a1 * b2) + a2 * b1) + a2 * b2;
	    }
	    {
	      /* Compute double-double = double-double + double-double. */
	      double bv;
	      double s1, s2, t1, t2;

	      /* Add two hi words. */
	      s1 = head_tmp1 + head_tmp2;
	      bv = s1 - head_tmp1;
	      s2 = ((head_tmp2 - bv) + (head_tmp1 - (s1 - bv)));

	      /* Add two lo words. */
	      t1 = tail_tmp1 + tail_tmp2;
	      bv = t1 - tail_tmp1;
	      t2 = ((tail_tmp2 - bv) + (tail_tmp1 - (t1 - bv)));

	      s2 += t1;

	      /* Renormalize (s1, s2)  to  (t1, s2) */
	      t1 = s1 + s2;
	      s2 = s2 - (t1 - s1);

	      t2 += s2;

	      /* Renormalize (t1, t2)  */
	      head_tmp1 = t1 + t2;
	      tail_tmp1 = t2 - (head_tmp1 - t1);
	    }
	    c_i[cij] = head_tmp1;
	  }
	}

      }

      FPU_FIX_STOP;

      break;
    }
  }
}

