/* ---------------------------------------------------------------------
*
*  -- ScaLAPACK routine (version 1.0) --
*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
*     and University of California, Berkeley.
*     November 17, 1996
*
*  ---------------------------------------------------------------------
*/
/*
*  Include files
*/
#include "pblas.h"

void pcher2k_( uplo, trans, n, k, alpha, A, ia, ja, desc_A, B, ib, jb,
               desc_B, beta, C, ic, jc, desc_C )
/*
*  .. Scalar Arguments ..
*/
   F_CHAR      trans, uplo;
   int         * ia, * ib, * ic, * ja, * jb, * jc, * k, * n;
   float       * beta;
   complex     * alpha;
/* ..
*  .. Array Arguments ..
*/
   int         desc_A[], desc_B[], desc_C[];
   complex     A[], B[], C[];
{
/*
*  Purpose
*  =======
*
*  PCHER2K  performs one of the hermitian rank 2k operations
*
*  sub( C ) := alpha*sub( A )*conjg( sub( B ) )' +
*              conjg( alpha )*sub( B )*conjg( sub( A ) )' +
*              beta*sub( C ),
*
*  or
*
*  sub( C ) := alpha*conjg( sub( A )' )*sub( B ) +
*              conjg( alpha )*conjg( sub( B )' )*sub( A ) +
*              beta*sub( C ),
*
*  where sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1),
*
*        sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1)  if TRANS = 'N',
*                         A(IA:IA+K-1,JA:JA+N-1)  otherwise,
*
*        sub( B ) denotes B(IB:IB+N-1,JB:JB+K-1)  if TRANS = 'N',
*                         B(IB:IB+K-1,JB:JB+N-1)  otherwise.
*
*  Alpha and beta are scalars with beta real, sub( C ) is an N-by-N
*  hermitian distributed matrix and sub( A ) and sub( B ) are N-by-K
*  distributed matrices in the first case and a K-by-N distributed
*  matrices in the second case.
*
*  Notes
*  =====
*
*  Each global data object is described by an associated description
*  vector.  This vector stores the information required to establish
*  the mapping between an object element and its corresponding process
*  and memory location.
*
*  Let A be a generic term for any 2D block cyclicly distributed array.
*  Such a global array has an associated description vector descA.
*  In the following comments, the character _ should be read as
*  "of the global array".
*
*  NOTATION        STORED IN      EXPLANATION
*  --------------- -------------- --------------------------------------
*  DT_A   (global) descA[ DT_ ]   The descriptor type.  In this case,
*                                 DT_A = 1.
*  CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating
*                                 the BLACS process grid A is distribu-
*                                 ted over. The context itself is glo-
*                                 bal, but the handle (the integer
*                                 value) may vary.
*  M_A    (global) descA[ M_ ]    The number of rows in the global
*                                 array A.
*  N_A    (global) descA[ N_ ]    The number of columns in the global
*                                 array A.
*  MB_A   (global) descA[ MB_ ]   The blocking factor used to distribu-
*                                 te the rows of the array.
*  NB_A   (global) descA[ NB_ ]   The blocking factor used to distribu-
*                                 te the columns of the array.
*  RSRC_A (global) descA[ RSRC_ ] The process row over which the first
*                                 row of the array A is distributed.
*  CSRC_A (global) descA[ CSRC_ ] The process column over which the
*                                 first column of the array A is
*                                 distributed.
*  LLD_A  (local)  descA[ LLD_ ]  The leading dimension of the local
*                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
*
*  Let K be the number of rows or columns of a distributed matrix,
*  and assume that its process grid has dimension p x q.
*  LOCr( K ) denotes the number of elements of K that a process
*  would receive if K were distributed over the p processes of its
*  process column.
*  Similarly, LOCc( K ) denotes the number of elements of K that a
*  process would receive if K were distributed over the q processes of
*  its process row.
*  The values of LOCr() and LOCc() may be determined via a call to the
*  ScaLAPACK tool function, NUMROC:
*          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
*          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
*  An upper bound for these quantities may be computed by:
*          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
*          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
*
*  according to a square block cyclic decomposition, i.e MB_C = NB_C, if
*  N+MOD(IC-1,MB_C) > MB_C or N+MOD(JC-1,NB_C) > NB_C, in which case
*  sub( C ) is not just contained into a block and IC-1 (resp. JC-1)
*  must be a multiple of MB_C (resp. NB_C).
*
*  If TRANS = 'N', then sub( A ), sub( B ) and sub( C ) must be row
*  aligned, i.e the row process having the first entries of sub( A )
*  must also own the first entries of sub( B ) and sub( C ).
*  If sub( C ) is not just contained into a block, IC-1 (resp. IB-1,
*  IA-1) must be a multiple of MB_C (resp. MB_B, MB_A) and the column
*  block size of sub( C ) must be equal to the row block size of A,
*  and the column block sizes of A and B must be equal, i.e NB_C = MB_A
*  and NB_A = NB_B.
*
*  Otherwise, then sub( A ), sub( B ) and sub( C ) must be column
*  aligned, i.e the column process having the first entries of sub( A )
*  must also own the first entries of sub( B ) and sub( C ).
*  If sub( C ) is not just contained into a block, JC-1 (resp. JB-1,
*  JA-1) must be a multiple of NB_C (resp. NB_B, NB_A) and the row
*  block size of sub( C ) must be equal to the column block size of A,
*  and the row block sizes of A and B must be equal, i.e MB_C = NB_A
*  and MB_A = MB_B.
*
*  Parameters
*  ==========
*
*  UPLO    (global input) pointer to CHARACTER
*          On  entry,   UPLO  specifies  whether  the  upper  or  lower
*          triangular  part  of the  distributed matrix sub( C ) is to
*          be  referenced  as  follows:
*
*          UPLO = 'U' or 'u' Only the upper triangular part of sub( C )
*                            is to be referenced,
*
*          UPLO = 'L' or 'l' Only the lower triangular part of sub( C )
*                            is to be referenced.
*
*  TRANS   (global input) pointer to CHARACTER
*          On entry,  TRANS  specifies the operation to be performed as
*          follows:
*
*          TRANS = 'N' or 'n'
*            sub( C ) := alpha*sub( A )*conjg( sub( B )' ) +
*                        conjg( alpha )*sub( B )*conjg( sub( A )' ) +
*                        beta*C,
*
*          TRANS = 'C' or 'c'
*            sub( C ) := alpha*conjg( sub( A )' )*sub( B ) +
*                        conjg( alpha )*conjg( sub( B )' )*sub( A ) +
*                        beta*sub( C ).
*
*  N       (global input) pointer to INTEGER
*          The order of the distributed matrix sub( C ). N >= 0.
*
*  K       (global input) pointer to INTEGER
*          On entry with  TRANS = 'N' or 'n', K specifies the number of
*          columns  of the distributed matrices sub( A ) and sub( B ),
*          and on entry with TRANS = 'C' or 'c', K specifies the number
*          of rows of the distributed matrices sub( A ) and sub( B ).
*          K >= 0.
*
*  A       (local input) COMPLEX pointer into the local memory
*          to an array of dimension (LLD_A, KLa), where KLa is
*          LOCc(JA+K-1) when  TRANS = 'N' or 'n',  and is LOCc(JA+N-1)
*          otherwise.  Before entry with TRANS = 'N' or 'n', this array
*          contains the local pieces of the distributed matrix sub( A ).
*
*  IA      (global input) pointer to INTEGER
*          The global row index of the submatrix of the distributed
*          matrix A to operate on.
*
*  JA      (global input) pointer to INTEGER
*          The global column index of the submatrix of the distributed
*          matrix A to operate on.
*
*  DESCA   (global and local input) INTEGER array of dimension 8.
*          The array descriptor of the distributed matrix A.
*
*  B       (local input) COMPLEX pointer into the local memory
*          to an array of dimension (LLD_B, KLb), where KLb is
*          LOCc(JB+K-1) when  TRANS = 'N' or 'n',  and is LOCc(JB+N-1)
*          otherwise.  Before entry with TRANS = 'N' or 'n', this array
*          contains the local pieces of the distributed matrix sub( B ).
*
*  IB      (global input) pointer to INTEGER
*          The global row index of the submatrix of the distributed
*          matrix B to operate on.
*
*  JB      (global input) pointer to INTEGER
*          The global column index of the submatrix of the distributed
*          matrix B to operate on.
*
*  DESCB   (global and local input) INTEGER array of dimension 8.
*          The array descriptor of the distributed matrix B.
*
*  BETA    (global input) pointer to REAL
*          On entry,  BETA  specifies the scalar beta.
*
*  C       (local input/local output) COMPLEX pointer into the
*          local memory to an array of dimension (LLD_C, LOCc(JC+N-1)).
*          Before entry with UPLO = 'U' or 'u', this array contains the
*          local pieces of the N-by-N upper triangular part of the
*          Hermitian distributed matrix sub( C ) and its strictly lower
*          triangular part is not referenced. On exit, the upper trian-
*          gular part of sub( C ) is overwritten by the upper triangular
*          part of the updated distributed matrix.
*          Before entry  with  UPLO = 'L' or 'l', this array contains the
*          local pieces of the N-by-N lower triangular part of the
*          Hermitian distributed matrix sub( C ) and its strictly upper
*          triangular part is not referenced. On exit, the lower trian-
*          gular part of sub( C ) is overwritten by the lower triangular
*          part of the updated distributed matrix.
*          Note that the imaginary parts of the diagonal elements of
*          sub( C ) need not be set, they are assumed to be zero, and
*          on exit they are set to zero.
*
*  IC      (global input) pointer to INTEGER
*          The global row index of the submatrix of the distributed
*          matrix C to operate on.
*
*  JC      (global input) pointer to INTEGER
*          The global column index of the submatrix of the distributed
*          matrix C to operate on.
*
*  DESCC   (global and local input) INTEGER array of dimension 8.
*          The array descriptor of the distributed matrix C.
*
*  =====================================================================
*
*  .. Local Scalars ..
*/
   char        * ctop, matblk, * rtop, TrA, UploA;
   int         block, i, iacol, iapos, iarow, ibcol, iblk, ibpos, ibrow,
               iccol, icoffa, icoffb, icoffc, icrow, ictxt, iia, iib,
               iic, in, info, iroffa, iroffb, iroffc, iszcmp, j, jblk,
               jja, jjb, jjc, jn, lcm, lcmp, lcmq, loop, mullen, mycol,
               myrow, nca, ncb, ncc, notran, np0, nprow, npcol, nq0,
               nra, nrb, nrc, szcmp, tmp0, tmp1, tmp2, tmp3, wksz;
   float       tbeta;
/* ..
*  .. PBLAS Buffer ..
*/
   complex     * buff;
/* ..
*  .. External Functions ..
*/
   void        blacs_gridinfo_();
   void        pbchkmat();
   void        pberror_();
   char        * getpbbuf();
   char        * ptop();
   F_VOID_FCT  pbcher2k_();
   F_INTG_FCT  ilcm_();
/* ..
*  .. Executable Statements ..
*
*  Get grid parameters
*/
   ictxt = desc_A[CTXT_];
   blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol );
/*
*  Test the input parameters
*/
   info = 0;
   if( nprow == -1 )
      info = -(900+CTXT_+1);
   else
   {
      UploA = Mupcase( F2C_CHAR( uplo )[0] );
      TrA = Mupcase( F2C_CHAR( trans )[0] );
      notran =  ( TrA == 'N' );
      iroffa = (*ia-1) % desc_A[MB_];
      icoffa = (*ja-1) % desc_A[NB_];
      iroffb = (*ib-1) % desc_B[MB_];
      icoffb = (*jb-1) % desc_B[NB_];
      iroffc = (*ic-1) % desc_C[MB_];
      icoffc = (*jc-1) % desc_C[NB_];
      if( notran )
      {
         pbchkmat( *n, 3, *k, 4, *ia, *ja, desc_A, 9, &iia, &jja,
                   &iarow, &iacol, nprow, npcol, myrow, mycol,
                   &nra, &nca, &info );
         pbchkmat( *n, 3, *k, 4, *ib, *jb, desc_B, 13, &iib, &jjb,
                   &ibrow, &ibcol, nprow, npcol, myrow, mycol,
                   &nrb, &ncb, &info );
         pbchkmat( *n, 3, *n, 3, *ic, *jc, desc_C, 18, &iic, &jjc,
                   &icrow, &iccol, nprow, npcol, myrow, mycol,
                   &nrc, &ncc, &info );
         block = ( ( ( (*n) + iroffc ) <= desc_C[MB_] ) &&
                   ( ( (*n) + icoffc ) <= desc_C[NB_] ) &&
                   ( ibcol == iacol ) );
         loop = ( ( ( (*k) + icoffa ) > desc_A[NB_] ) ||
                  ( ( (*k) + icoffb ) > desc_B[NB_] ) );

      }
      else
      {
         pbchkmat( *k, 4, *n, 3, *ia, *ja, desc_A, 9, &iia, &jja,
                   &iarow, &iacol, nprow, npcol, myrow, mycol,
                   &nra, &nca, &info );
         pbchkmat( *k, 4, *n, 3, *ib, *jb, desc_B, 13, &iib, &jjb,
                   &ibrow, &ibcol, nprow, npcol, myrow, mycol,
                   &nrb, &ncb, &info );
         pbchkmat( *n, 3, *n, 3, *ic, *jc, desc_C, 18, &iic, &jjc,
                   &icrow, &iccol, nprow, npcol, myrow, mycol,
                   &nrc, &ncc, &info );
         block = ( ( ( (*n) + iroffc ) <= desc_C[MB_] ) &&
                   ( ( (*n) + icoffc ) <= desc_C[NB_] ) &&
                   ( ibrow == iarow ) );
         loop = ( ( ( (*k) + iroffa ) > desc_A[MB_] ) ||
                  ( ( (*k) + iroffb ) > desc_B[MB_] ) );
      }
      if( info == 0 )
      {
         if( ( UploA != 'U' ) && ( UploA != 'L' ) )
            info = -1;
         else if( ( TrA != 'N' ) && ( TrA != 'C' ) )
            info = -2;
         if( block )
         {
            if( notran )
            {
               if( ( nprow != 1 ) && ( (*n)+iroffa > desc_A[MB_] ) )
                  info = -7;
               else if( ( nprow != 1 ) && ( (*n)+iroffb > desc_B[MB_] ) )
                  info = -11;
               else if( icrow != iarow )
                  info = -16;
               else if( icrow != ibrow )
                  info = -16;
            }
            else
            {
               if( ( npcol != 1 ) && ( (*n)+icoffa > desc_A[NB_] ) )
                  info = -8;
               else if( ( npcol != 1 ) && ( (*n)+icoffb > desc_B[NB_] ) )
                  info = -12;
               else if( iccol != iacol )
                  info = -17;
               else if( iccol != ibcol )
                  info = -17;
            }
         }
         else
         {
            if( notran )
            {
               if( iroffa != 0 )
                 info = -7;
               else if( iroffb != 0 )
                 info = -11;
               else if( icrow != iarow )
                  info = -16;
               else if( icrow != ibrow )
                  info = -16;
               else if( ( desc_A[MB_] != desc_C[NB_] ) ||
                        ( desc_B[MB_] != desc_C[NB_] ) )
                  info = -(1800+MB_+1);
            }
            else
            {
               if( icoffa != 0 )
                 info = -8;
               else if( icoffb != 0 )
                 info = -12;
               else if( iccol != iacol )
                  info = -17;
               else if( iccol != ibcol )
                  info = -17;
               else if( ( desc_A[NB_] != desc_C[MB_] ) ||
                        ( desc_B[NB_] != desc_C[MB_] ) )
                  info = -(1800+NB_+1);
            }
            if( iroffc != 0 )
               info = -16;
            else if( icoffc != 0 )
               info = -17;
            else if( desc_C[MB_] != desc_C[NB_] )
               info = -(1800+NB_+1);
         }
         if( notran )
         {
            if( ( loop ) && ( icoffa != icoffb ) )
               info = -12;
            else if( ( loop || !block ) && ( desc_A[NB_] != desc_B[NB_] ) )
               info = -(1300+NB_+1);
         }
         else
         {
            if( ( loop ) && ( iroffa != iroffb ) )
               info = -11;
            else if( ( loop || !block ) && ( desc_A[MB_] != desc_B[MB_] ) )
               info = -(1300+MB_+1);
         }
         if( ictxt != desc_B[CTXT_] )
            info = -(1300+CTXT_+1);
         else if( ictxt != desc_C[CTXT_] )
            info = -(1800+CTXT_+1);
      }
   }
   if( info )
   {
      pberror_( &ictxt, "PCHER2K", &info );
      return;
   }
/*
*  Quick return if possible.
*/
   if( ( *n == 0 ) ||
       ( ( ( ( alpha->re == ZERO ) && ( alpha->im == ZERO ) ) ||
           ( *k == 0                                        ) ) &&
           ( ( *beta == ONE )                               ) ) )
      return;
/*
*  Figure out the arguments to be passed to pbcher2k
*/
   mullen = MULLENFAC * desc_C[MB_];
   if( block )
   {
      matblk = 'B';
      wksz = (*n) * (*n);
      if( notran )
      {
         iapos = iacol;
         ibpos = ibcol;
      }
      else
      {
         iapos = iarow;
         ibpos = ibrow;
      }
   }
   else
   {
      matblk = 'M';
      lcm = ilcm_( &nprow, &npcol );
      lcmp = lcm / nprow;
      lcmq = lcm / npcol;
      tmp3 = lcmq * desc_C[MB_];
      iszcmp = CEIL( mullen, tmp3 );
      szcmp = iszcmp * iszcmp * lcmq*desc_C[MB_] * lcmp*desc_C[MB_];
      tmp0 = CEIL( (*n), desc_C[NB_] );
      if( notran )
      {
         iapos = iacol;
         ibpos = ibcol;
         tmp1 = (*n) / desc_C[MB_];
         nq0 = MYROC0( tmp1, *n, desc_C[MB_], npcol );
         tmp2 = nq0 / desc_C[NB_];
         tmp3 = desc_A[NB_] * MYROC0( tmp2, nq0, desc_C[NB_], lcmq ) *
                MIN( tmp0, lcmq );
         wksz = ( nq0 + 2*MYROC0( tmp1, *n, desc_C[MB_], nprow ) ) *
                desc_A[NB_] + MAX( szcmp, tmp3 );
      }
      else
      {
         iapos = iarow;
         ibpos = ibrow;
         tmp1 = (*n) / desc_C[MB_];
         np0 = MYROC0( tmp1, *n, desc_C[MB_], nprow );
         tmp2 = np0 / desc_C[MB_];
         tmp3 = desc_A[MB_] * MYROC0( tmp2, np0, desc_C[MB_], lcmp ) *
                MIN( tmp0, lcmp );
         wksz = ( np0 + 2*MYROC0( tmp1, *n, desc_C[MB_], npcol ) ) *
                desc_A[MB_] + MAX( szcmp, tmp3 );
      }
   }
  buff = (complex *)getpbbuf( "PCHER2K", wksz*sizeof(complex) );
/*
*  Call PB-BLAS routine
*/
   tbeta = ONE;
   if( block )
   {
      if( notran )
      {
         j = CEIL( (*ja), desc_A[NB_] ) * desc_A[NB_];
         jn = (*ja)+(*k)-1;
         jn = MIN( j, jn );
                                     /* Handle first block separately */
         jblk = jn-(*ja)+1;
         pbcher2k_( &ictxt, C2F_CHAR( &matblk ), uplo, trans, n, &jblk,
                    &desc_A[NB_], alpha, &A[iia-1+(jja-1)*desc_A[LLD_]],
                    &desc_A[LLD_], &B[iib-1+(jjb-1)*desc_B[LLD_]],
                    &desc_B[LLD_], beta, &C[iic-1+(jjc-1)*desc_C[LLD_]],
                    &desc_C[LLD_], &iapos, &ibpos, &icrow, &iccol,
                    C2F_CHAR( TOPDEF ), C2F_CHAR( NO ), C2F_CHAR( NO ),
                    C2F_CHAR( NO ), &mullen, buff );
         if( mycol == iapos )
         {
            jja += jblk;
            jja = MIN( jja, nca );
         }
         if( mycol == ibpos )
         {
            jjb += jblk;
            jjb = MIN( jjb, ncb );
         }
         iapos = (iapos+1) % npcol;
         ibpos = (ibpos+1) % npcol;
         jblk = (*k) - jblk;
         pbcher2k_( &ictxt, C2F_CHAR( &matblk ), uplo, trans, n, &jblk,
                    &desc_A[NB_], alpha, &A[iia-1+(jja-1)*desc_A[LLD_]],
                    &desc_A[LLD_], &B[iib-1+(jjb-1)*desc_B[LLD_]],
                    &desc_B[LLD_], &tbeta, &C[iic-1+(jjc-1)*desc_C[LLD_]],
                    &desc_C[LLD_], &iapos, &ibpos, &icrow, &iccol,
                    C2F_CHAR( TOPDEF ), C2F_CHAR( NO ), C2F_CHAR( NO ),
                    C2F_CHAR( NO ), &mullen, buff );
      }
      else
      {
         i = CEIL( (*ia), desc_A[MB_] ) * desc_A[MB_];
         in = (*ia)+(*k)-1;
         in = MIN( i, in );
                                     /* Handle first block separately */
         iblk = in-(*ia)+1;
         pbcher2k_( &ictxt, C2F_CHAR( &matblk ), uplo, trans, n, &iblk,
                    &desc_A[MB_], alpha, &A[iia-1+(jja-1)*desc_A[LLD_]],
                    &desc_A[LLD_], &B[iib-1+(jjb-1)*desc_B[LLD_]],
                    &desc_B[LLD_], beta, &C[iic-1+(jjc-1)*desc_C[LLD_]],
                    &desc_C[LLD_], &iapos, &ibpos, &icrow, &iccol,
                    C2F_CHAR( TOPDEF ), C2F_CHAR( NO ), C2F_CHAR( NO ),
                    C2F_CHAR( NO ), &mullen, buff );
         if( myrow == iapos )
         {
            iia += iblk;
            iia = MIN( iia, nra );
         }
         if( myrow == ibpos )
         {
             iib += iblk;
             iib = MIN( iib, nrb );
         }
         iapos = (iapos+1) % nprow;
         ibpos = (ibpos+1) % nprow;
         iblk = (*k) - iblk;
         pbcher2k_( &ictxt, C2F_CHAR( &matblk ), uplo, trans, n, &iblk,
                    &desc_A[MB_], alpha, &A[iia-1+(jja-1)*desc_A[LLD_]],
                    &desc_A[LLD_], &B[iib-1+(jjb-1)*desc_B[LLD_]],
                    &desc_B[LLD_], &tbeta, &C[iic-1+(jjc-1)*desc_C[LLD_]],
                    &desc_C[LLD_], &iapos, &ibpos, &icrow, &iccol,
                    C2F_CHAR( TOPDEF ), C2F_CHAR( NO ), C2F_CHAR( NO ),
                    C2F_CHAR( NO ), &mullen, buff );
      }
   }
   else
   {
      if( notran )
      {
         rtop = ptop( BROADCAST, ROW, TOPGET );
         j = CEIL( (*ja), desc_A[NB_] ) * desc_A[NB_];
         jn = (*ja)+(*k)-1;
         jn = MIN( j, jn );
                                     /* Handle first block separately */
         jblk = jn-(*ja)+1;
         pbcher2k_( &ictxt, C2F_CHAR( &matblk ), uplo, trans, n, &jblk,
                    &desc_A[MB_], alpha, &A[iia-1+(jja-1)*desc_A[LLD_]],
                    &desc_A[LLD_], &B[iib-1+(jjb-1)*desc_B[LLD_]],
                    &desc_B[LLD_], beta, &C[iic-1+(jjc-1)*desc_C[LLD_]],
                    &desc_C[LLD_], &iapos, &ibpos, &icrow, &iccol,
                    C2F_CHAR( rtop ), C2F_CHAR( NO ), C2F_CHAR( NO ),
                    C2F_CHAR( NO ), &mullen, buff );
         if( mycol == iapos )
         {
            jja += jblk;
            jja = MIN( jja, nca );
         }
         if( mycol == ibpos )
         {
            jjb += jblk;
            jjb = MIN( jjb, ncb );
         }
         iapos = (iapos+1) % npcol;
         ibpos = (ibpos+1) % npcol;
                              /* loop over remaining block of columns */
         tmp0 = (*ja)+(*k)-1;
         for( j=jn+1; j<=tmp0; j+=desc_A[NB_] )
         {
             jblk = (*k)-j+(*ja);
             jblk = MIN( desc_A[NB_], jblk );
             pbcher2k_( &ictxt, C2F_CHAR( &matblk ), uplo, trans, n,
                        &jblk, &desc_A[MB_], alpha,
                        &A[iia-1+(jja-1)*desc_A[LLD_]], &desc_A[LLD_],
                        &B[iib-1+(jjb-1)*desc_B[LLD_]], &desc_B[LLD_],
                        &tbeta,
                        &C[iic-1+(jjc-1)*desc_C[LLD_]], &desc_C[LLD_],
                        &iapos, &ibpos, &icrow, &iccol, C2F_CHAR( rtop ),
                        C2F_CHAR( NO ), C2F_CHAR( NO ), C2F_CHAR( NO ),
                        &mullen, buff );
             if( mycol == iapos )
             {
                jja += jblk;
                jja = MIN( jja, nca );
             }
             if( mycol == ibpos )
             {
                jjb += jblk;
                jjb = MIN( jjb, ncb );
             }
             iapos = (iapos+1) % npcol;
             ibpos = (ibpos+1) % npcol;
         }
      }
      else
      {
         ctop = ptop( BROADCAST, COLUMN, TOPGET );
         i = CEIL( (*ia), desc_A[MB_] ) * desc_A[MB_];
         in = (*ia)+(*k)-1;
         in = MIN( i, in );
                                     /* Handle first block separately */
         iblk = in-(*ia)+1;
         pbcher2k_( &ictxt, C2F_CHAR( &matblk ), uplo, trans, n, &iblk,
                    &desc_A[NB_], alpha, &A[iia-1+(jja-1)*desc_A[LLD_]],
                    &desc_A[LLD_], &B[iib-1+(jjb-1)*desc_B[LLD_]],
                    &desc_B[LLD_], beta, &C[iic-1+(jjc-1)*desc_C[LLD_]],
                    &desc_C[LLD_], &iapos, &ibpos, &icrow, &iccol,
                    C2F_CHAR( ctop ), C2F_CHAR( NO ), C2F_CHAR( NO ),
                    C2F_CHAR( NO ), &mullen, buff );
         if( myrow == iapos )
         {
            iia += iblk;
            iia = MIN( iia, nra );
         }
         if( myrow == ibpos )
         {
            iib += iblk;
            iib = MIN( iib, nrb );
         }
         iapos = (iapos+1) % nprow;
         ibpos = (ibpos+1) % nprow;
                                 /* loop over remaining block of rows */
         tmp0 = (*ia)+(*k)-1;
         for( i=in+1; i<=tmp0; i+=desc_A[MB_] )
         {
             iblk = *k-i+(*ia);
             iblk = MIN( desc_A[MB_], iblk );
             pbcher2k_( &ictxt, C2F_CHAR( &matblk ), uplo, trans, n,
                        &iblk, &desc_A[NB_], alpha,
                        &A[iia-1+(jja-1)*desc_A[LLD_]], &desc_A[LLD_],
                        &B[iib-1+(jjb-1)*desc_B[LLD_]], &desc_B[LLD_],
                        &tbeta,
                        &C[iic-1+(jjc-1)*desc_C[LLD_]], &desc_C[LLD_],
                        &iapos, &ibpos, &icrow, &iccol, C2F_CHAR( ctop ),
                        C2F_CHAR( NO ), C2F_CHAR( NO ), C2F_CHAR( NO ),
                        &mullen, buff );
             if( myrow == iapos )
             {
                iia += iblk;
                iia = MIN( iia, nra );
             }
             if( myrow == ibpos )
             {
                iib += iblk;
                iib = MIN( iib, nrb );
             }
             iapos = (iapos+1) % nprow;
             ibpos = (ibpos+1) % nprow;
         }
      }
   }
}
