/*
 *	Ohio Trollius
 *	Copyright 1997 The Ohio State University
 *	NJN/RBD
 *
 *	$Id: lamdtype.c,v 6.1.1.1 97/02/24 18:41:16 nevin Exp $
 *
 *	Function:	- miscellaneous and conversion datatype functions
 */

#include <errno.h>
#include <stdlib.h>

#include <mpi.h>
#include <mpisys.h>
#include <mpitrace.h>
#include <mpi_types.h>
#include <rpisys.h>

/*
 * global functions
 *
 * All declarations below use the old (non-prototype) form: they give
 * the return type only, so argument lists are not checked at call sites.
 * lam_copyrev4(), lam_copyrev8() and lam_copyrevn() carry an "extern"
 * keyword but are defined later in this same file.
 */
int			lam_type_free();
int			lam_dtsndrcv();
void			lam_dtcpy();
void			lam_dtblock();
void			lam_dtalign();
int			lam_dtbuffer();
void			lam_copyrev2();
extern void		lam_copyrev4();
extern void		lam_copyrev8();
extern void		lam_copyrevn();

/*
 * external functions
 *
 * lam_tr_dtypefree() is not defined in this file; it is called by
 * lam_type_free() just before the top-level datatype is released.
 */
extern void		lam_tr_dtypefree();

/*
 * local functions
 *
 * Helpers used by lam_dtcpy() for the non-contiguous datatype formats.
 */
static void		cpy_hvector();
static void		cpy_hindexed();
static void		cpy_struct();


/*
 *	lam_type_free
 *
 *	Function:	- really free a datatype
 *			- child datatypes not flagged LAM_PREDEF are
 *			  released first through MPI_Type_free()
 *	Accepts:	- datatype
 *	Returns:	- MPI_SUCCESS, the error code of a failed child
 *			  MPI_Type_free(), or a lam_mkerr() code for a
 *			  basic (MPI_ERR_TYPE) or unknown (MPI_ERR_ARG)
 *			  datatype format
 */
int
lam_type_free(dt)

MPI_Datatype		dt;

{
	int		rc;			/* status of child free */
	int		k;			/* struct member index */

	switch (dt->dt_format) {
/*
 * Basic datatypes are rejected here.
 */
	case LAM_DTBASIC:
		return(lam_mkerr(MPI_ERR_TYPE, 0));
/*
 * A struct has one child datatype per member.
 */
	case LAM_DTSTRUCT:
		for (k = 0; k < dt->dt_count; ++k) {
			if (dt->dt_dtypes[k]->dt_flags & LAM_PREDEF) {
				continue;
			}

			rc = MPI_Type_free(&(dt->dt_dtypes[k]));
			if (rc != MPI_SUCCESS) {
				return(rc);
			}
		}

		if (dt->dt_dtypes != 0) {
			free((char *) dt->dt_dtypes);
		}
		break;
/*
 * All other derived formats have exactly one child datatype.
 * The two indexed formats also own a displacement array.
 */
	case LAM_DTCONTIG:
	case LAM_DTVECTOR:
	case LAM_DTHVECTOR:
	case LAM_DTINDEXED:
	case LAM_DTHINDEXED:
		if (!(dt->dt_dtype->dt_flags & LAM_PREDEF)) {
			rc = MPI_Type_free(&(dt->dt_dtype));
			if (rc != MPI_SUCCESS) {
				return(rc);
			}
		}

		if ((dt->dt_format == LAM_DTHINDEXED
				|| dt->dt_format == LAM_DTINDEXED)
				&& dt->dt_disps != 0) {
			free((char *) dt->dt_disps);
		}
		break;

	default:
		return(lam_mkerr(MPI_ERR_ARG, 0));
	}
/*
 * Release the top datatype.  The trace hook is always called; the
 * object itself is only freed when its label is above TRDTMAX.
 */
	lam_tr_dtypefree(dt);

	if (dt->dt_label > TRDTMAX) {
		free((char *) dt);
	}

	return(MPI_SUCCESS);
}

/*
 *	lam_dtsndrcv
 *
 *	Function:	- copy MPI message from buffer into another
 *			- send/recv done if cannot optimize
 *	Accepts:	- send buffer
 *			- send count
 *			- send datatype
 *			- receive buffer
 *			- receive count
 *			- receive datatype
 *			- tag
 *			- communicator
 *	Returns:	- MPI_SUCCESS or error code
 *			- in the three local-copy cases the result is the
 *			  lam_mkerr(MPI_ERR_TRUNCATE, 0) code unless the
 *			  data fits and the copy/pack/unpack succeeds
 */
int
lam_dtsndrcv(sbuf, scount, sdtype, rbuf, rcount, rdtype, tag, comm)

void			*sbuf;
int			scount;
MPI_Datatype		sdtype;
void			*rbuf;
int			rcount;
MPI_Datatype		rdtype;
int			tag;
MPI_Comm		comm;

{
	int		err;			/* error code */
	int		size;			/* packed size */
	int		rank;			/* caller's rank */
	MPI_Status	stat;			/* status info */
/*
 * Assume truncation until one of the local copy paths succeeds.
 */
	err = lam_mkerr(MPI_ERR_TRUNCATE, 0);
/*
 * If same datatypes used, just copy.  Only the scount elements that
 * are actually present in the send buffer are copied; the receive
 * count is merely the capacity of the receive buffer.
 */
	if (sdtype == rdtype) {
		if (scount <= rcount) {
			lam_dtcpy((char *) rbuf, (char *) sbuf, scount,
					rdtype);
			err = MPI_SUCCESS;
		}
	}
/*
 * If receive packed.  The receive count is compared against the
 * packed size, which is queried using MPI_COMM_WORLD.
 */
	else if (rdtype == MPI_PACKED) {
		MPI_Pack_size(scount, sdtype, MPI_COMM_WORLD, &size);
		if (size <= rcount) {
			if (lam_pack(sbuf, scount, sdtype,
						rbuf, rcount) == size) {
				err = MPI_SUCCESS;
			}
		}
	}
/*
 * If send packed.  The send count is compared against the packed
 * size of the receive side.
 */
	else if (sdtype == MPI_PACKED) {
		MPI_Pack_size(rcount, rdtype, MPI_COMM_WORLD, &size);
		if (size >= scount) {
			if (lam_unpack(sbuf, scount,
					rbuf, rcount, rdtype) == scount) {
				err = MPI_SUCCESS;
			}
		}
	}
/*
 * Let send/recv handle it: the caller sends to and receives from
 * its own rank in the given communicator.
 */
	else {
		MPI_Comm_rank(comm, &rank);
		err = MPI_Sendrecv(sbuf, scount, sdtype, rank, tag,
				rbuf, rcount, rdtype, rank, tag, comm, &stat);
	}

	return(err);
}

/*
 *	lam_dtcpy
 *
 *	Function:	- copy "count" instances of a datatype from one
 *			  buffer to another
 *			- non-contiguous layouts are copied recursively
 *	Accepts:	- destination buffer
 *			- source buffer
 *			- count
 *			- datatype
 */
void
lam_dtcpy(dest, src, count, dtype)

char			*dest;
char			*src;
int			count;
MPI_Datatype		dtype;

{
	int		flat;			/* one memcpy suffices? */
/*
 * A single memcpy is enough when the type needs no packing and
 * either only one instance is copied or the extent needs no
 * adjustment.
 */
	flat = (dtype->dt_flags & LAM_DTNOPACK) != 0;
	if (flat && count != 1) {
		flat = (dtype->dt_flags & LAM_DTNOXADJ) != 0;
	}

	if (flat) {
		memcpy(dest, src, count * dtype->dt_size);
		return;
	}
/*
 * Otherwise dispatch on the datatype format.  The last argument of
 * the vector and indexed helpers tells them whether strides or
 * displacements are counted in elements (1) or in bytes (0).
 */
	switch (dtype->dt_format) {

	case LAM_DTSTRUCT:
		cpy_struct(dest, src, count, dtype);
		break;

	case LAM_DTVECTOR:
	case LAM_DTHVECTOR:
		cpy_hvector(dest, src, count, dtype,
				dtype->dt_format == LAM_DTVECTOR);
		break;

	case LAM_DTINDEXED:
	case LAM_DTHINDEXED:
		cpy_hindexed(dest, src, count, dtype,
				dtype->dt_format == LAM_DTINDEXED);
		break;

	case LAM_DTCONTIG:
		lam_dtcpy(dest, src, count * dtype->dt_count,
				dtype->dt_dtype);
		break;
	}
}

/*
 *	cpy_hvector
 *
 *	Function:	- copy vector or hvector
 *			- "num" instances of the datatype are copied, each
 *			  one datatype extent further on in both buffers
 *	Accepts:	- destination buffer
 *			- source buffer
 *			- number of datatype instances
 *			- vector or hvector datatype
 *			- non-zero if the stride is counted in subtype
 *			  extents (vector), zero if in bytes (hvector)
 */
static void
cpy_hvector(dest, src, num, dtype, fl_vec)

char			*dest;
char			*src;
int			num;
MPI_Datatype		dtype;
int			fl_vec;

{
	MPI_Datatype	subtype;		/* type of vector elements */
	int		blksize;		/* size of block */
	int		count;			/* number of blocks */
	int		extent;			/* datatype extent */
	int		stride;			/* stride in bytes */
	int		i, j;

	subtype = dtype->dt_dtype;
	extent = dtype->dt_upper - dtype->dt_lower;
	count = dtype->dt_count;
/*
 * Convert an element stride into a byte stride.
 */
	stride = dtype->dt_stride;
	if (fl_vec) {
	    stride *= subtype->dt_upper - subtype->dt_lower;
	}

	blksize = dtype->dt_length * subtype->dt_size;

	if ((subtype->dt_flags & LAM_DTNOPACK) && ((dtype->dt_length == 1)
	    	|| (subtype->dt_flags & LAM_DTNOXADJ))) {
/*
 * The subtype blocks are contiguous so just loop through the vector
 * copying them.  If the buffers are sufficiently aligned, the blocks
 * are of size 4 or 8, and the stride is a multiple of the block size
 * then assignment is used to eliminate memcpy overhead.  Because the
 * buffers are advanced by the datatype extent between instances, the
 * extent must also be aligned whenever more than one instance is
 * copied; otherwise later instances would be accessed through
 * misaligned word pointers.
 */
	    if (blksize == 4 && ALIGNEDU4(stride)
		    	&& ALIGNEDU4(dest) && ALIGNEDU4(src)
			&& (num <= 1 || ALIGNEDU4(extent))) {

		uint4	*d, *s;
		
		stride /= 4;

		for (i = 0; i < num; ++i, src += extent, dest += extent) {
		    s = (uint4 *) src;
		    d = (uint4 *) dest;

		    for (j = 0; j < count; ++j) {
			d[j * stride] = s[j * stride];
		    }
		}
	    }
	    else if (blksize == 8 && ALIGNEDF8(stride)
		    	&& ALIGNEDF8(dest) && ALIGNEDF8(src)
			&& (num <= 1 || ALIGNEDF8(extent))) {

		float8	*d, *s;

		stride /= 8;

		for (i = 0; i < num; ++i, src += extent, dest += extent) {
		    s = (float8 *) src;
		    d = (float8 *) dest;
		    
		    for (j = 0; j < count; ++j) {
			d[j * stride] = s[j * stride];
		    }
		}
	    }
/*
 * General contiguous-block case: byte copy of each block.
 */
	    else {
		for (i = 0; i < num; ++i, src += extent, dest += extent) {
		    for (j = 0; j < count; ++j) {
			memcpy(dest + j * stride, src + j * stride, blksize);
		    }
		}
	    }
	}
/*
 * Subtype blocks are not contiguous and need to be recursively copied.
 */
	else {
	    for (i = 0; i < num; ++i, src += extent, dest += extent) {
		for (j = 0; j < count; ++j) {
		    lam_dtcpy(dest + j * stride, src + j * stride,
				dtype->dt_length, subtype);
		}
	    }
	}
}

/*
 *	cpy_hindexed
 *
 *	Function:	- copy indexed or hindexed
 *	Accepts:	- destination buffer
 *			- source buffer
 *			- number of datatype instances
 *			- indexed or hindexed datatype
 *			- non-zero if displacements are counted in element
 *			  extents (indexed), zero if in bytes (hindexed)
 */
static void
cpy_hindexed(dest, src, num, dtype, fl_idx)

char			*dest;
char			*src;
int			num;
MPI_Datatype		dtype;
int			fl_idx;

{
	MPI_Aint	off;			/* byte offset of a block */
	int		span;			/* datatype extent */
	int		scale;			/* displacement multiplier */
	int		inst;			/* instance index */
	int		blk;			/* block index */

	span = dtype->dt_upper - dtype->dt_lower;
/*
 * Byte displacements are used as they are; element displacements
 * are scaled by the extent of the element datatype.
 */
	if (fl_idx == 0) {
	    scale = 1;
	} else {
	    scale = dtype->dt_dtype->dt_upper - dtype->dt_dtype->dt_lower;
	}

	for (inst = 0; inst < num; ++inst) {

	    for (blk = 0; blk < dtype->dt_count; ++blk) {

		off = dtype->dt_disps[blk] * scale;

		lam_dtcpy(dest + off, src + off,
			    dtype->dt_lengths[blk], dtype->dt_dtype);
	    }
/*
 * Step both buffers to the next instance of the datatype.
 */
	    src += span;
	    dest += span;
	}
}

/*
 *	cpy_struct
 *
 *	Function:	- copy struct
 *			- every member block is copied with lam_dtcpy()
 *			  at its own displacement and with its own type
 *	Accepts:	- destination buffer
 *			- source buffer
 *			- number of datatype instances
 *			- struct datatype
 */
static void
cpy_struct(dest, src, num, dtype)

char			*dest;
char			*src;
int			num;
MPI_Datatype		dtype;

{
	int		span;			/* datatype extent */
	int		inst;			/* instance index */
	int		m;			/* member index */

	span = dtype->dt_upper - dtype->dt_lower;

	for (inst = 0; inst < num; ++inst) {

		for (m = 0; m < dtype->dt_count; ++m) {

			lam_dtcpy(dest + dtype->dt_disps[m],
					src + dtype->dt_disps[m],
					dtype->dt_lengths[m],
					dtype->dt_dtypes[m]);
		}
/*
 * Step both buffers to the next instance of the datatype.
 */
		src += span;
		dest += span;
	}
}

/*
 *	lam_dtblock
 *
 *	Function:	- add data block to datatype
 *			- blocks must be added in the order they appear in
 *			  the datatype constructor
 *			- updates the size, element count, bounds, data
 *			  limits and flags of the datatype being built
 *	Accepts:	- datatype the block is being added to
 *			- block datatype
 *			- block count
 *			- block displacement
 */
void
lam_dtblock(new, old, count, disp)

MPI_Datatype		new;
MPI_Datatype		old;
int			count;
int			disp;

{
	int		extent;			/* extent of the block */
	int		upper;			/* upper bound of new type */
	int		lower;			/* lower bound of new type */
	int		dataup;			/* new type data upper limit */
	int		datalow;		/* new type data lower limit */
	int		pack;			/* do we need to pack? */
/*
 * Determine the new bounds.
 * "extent" is the distance from the first to the last copy of the old
 * type in the block.  A positive distance stretches the upper limits,
 * otherwise the (zero or negative) distance is applied to the lower
 * limits.  All limits are shifted by the block displacement.
 * A block with a non-positive count gets all-zero limits.
 */
	if (count > 0) {
		extent = (old->dt_upper - old->dt_lower) * (count - 1);

		if (extent > 0) {
			upper = old->dt_upper + extent + disp;
			dataup = old->dt_dataup + extent + disp;
			lower = old->dt_lower + disp;
			datalow = old->dt_datalow + disp;
		} else {
			upper = old->dt_upper + disp;
			dataup = old->dt_dataup + disp;
			lower = old->dt_lower + extent + disp;
			datalow = old->dt_datalow + extent + disp;
		}
	} else {
		upper = dataup = 0;
		lower = datalow = 0;
	}
/*
 * First block in the new type.
 * A negative size marks a type to which no block has been added yet
 * (presumably set by the datatype constructor -- not visible here).
 * The limits of the block are taken over unchanged, and only the
 * explicit-bound flags are inherited from the old type.
 */
	if (new->dt_size < 0) {

		new->dt_upper = upper;
		new->dt_dataup = dataup;
		new->dt_lower = lower;
		new->dt_datalow = datalow;

		new->dt_flags = old->dt_flags & (LAM_DTHASUB | LAM_DTHASLB);
		new->dt_size = count * old->dt_size;
		new->dt_nelem = count * old->dt_nelem;

		if (count > 0) {
			new->dt_flags |= (LAM_DTLOWSET | LAM_DTHIGHSET);
		}
/*
 * No packing is needed if the old type needs none and the data of
 * the block spans exactly as many bytes as it contains.
 */
		if ((old->dt_flags & LAM_DTNOPACK)
				&& (dataup - datalow == new->dt_size)) {
			new->dt_flags |= LAM_DTNOPACK;
		}
	}
/*
 * Adding a new non-null block.
 * Blocks with a non-positive count that are not the first block are
 * ignored entirely.
 */
	else if (count > 0) {
/*
 * Only blocks that carry data affect the data limits, the size,
 * the element count and the packing flag.
 */
		if (old->dt_size > 0) {

			pack = !(old->dt_flags & LAM_DTNOPACK);

			if (new->dt_size == 0) {
/*
 * No data so far: the data limits are those of this block.
 */
				new->dt_dataup = dataup;
				new->dt_datalow = datalow;
			} else {
/*
 * Packing is also needed if this block does not start exactly where
 * the data accumulated so far ends.  Then widen the data limits.
 */
				pack = pack || (new->dt_dataup != datalow);

				if (dataup > new->dt_dataup) {
					new->dt_dataup = dataup;
				}
				if (datalow < new->dt_datalow) {
					new->dt_datalow = datalow;
				}
			}

			new->dt_size += count * old->dt_size;
			new->dt_nelem += count * old->dt_nelem;

			if (pack) {
				new->dt_flags &= ~LAM_DTNOPACK;
			}
		}
/*
 * Update the upper bound.
 * An explicit upper bound in the block either raises the explicit
 * bound already present or replaces the current upper bound.
 * Without any explicit bound, the upper bound follows the data
 * upper limit, provided the block carries data.
 */
		if (old->dt_flags & LAM_DTHASUB) {
			if (new->dt_flags & LAM_DTHASUB) {
				if (upper > new->dt_upper) {
					new->dt_upper = upper;
				}
			} else {
				new->dt_flags |= LAM_DTHASUB;
				new->dt_upper = upper;
			}
			new->dt_flags |= LAM_DTHIGHSET;
		} else {
			if (!(new->dt_flags & LAM_DTHASUB)
					&& (old->dt_size > 0)) {
				new->dt_upper = new->dt_dataup;
				new->dt_flags |= LAM_DTHIGHSET;
			}
		}
/*
 * Update the lower bound.
 * Mirror image of the upper bound handling above.
 */
		if (old->dt_flags & LAM_DTHASLB) {
			if (new->dt_flags & LAM_DTHASLB) {
				if (lower < new->dt_lower) {
					new->dt_lower = lower;
				}
			} else {
				new->dt_flags |= LAM_DTHASLB;
				new->dt_lower = lower;
			}
			new->dt_flags |= LAM_DTLOWSET;
		} else {
			if (!(new->dt_flags & LAM_DTHASLB)
					&& (old->dt_size > 0)) {
				new->dt_lower = new->dt_datalow;
				new->dt_flags |= LAM_DTLOWSET;
			}
		}
/*
 * This ugliness is required to take care of pathological cases like
 * creating a datatype with a type map {(10, MPI_UB)}.  In this
 * case the lower bound must also be set to 10.
 * In general: if only one of the two bounds has been set so far,
 * the other one is set to the same value.
 */
		if ((new->dt_flags & LAM_DTLOWSET)
				&& !(new->dt_flags & LAM_DTHIGHSET)) {
			new->dt_upper = new->dt_lower;
			new->dt_flags |= LAM_DTHIGHSET;
		}
		else if (!(new->dt_flags & LAM_DTLOWSET)
				&& (new->dt_flags & LAM_DTHIGHSET)) {
			new->dt_lower = new->dt_upper;
			new->dt_flags |= LAM_DTLOWSET;
		}
	}
}

/*
 *	lam_dtalign
 *
 *	Function:	- adjust extent of datatype to account for alignment
 *			- the upper bound is raised so that the magnitude
 *			  of the extent is a multiple of the alignment
 *			- flags the datatype if its bounds coincide with
 *			  its data limits
 *	Accepts:	- datatype
 */
void
lam_dtalign(dtype)

MPI_Datatype		dtype;

{
	int		span;			/* current extent */
	int		rem;			/* extent modulo alignment */
	int		pad;			/* amount added to upper bound */
/*
 * The upper bound is left alone if MPI_UB is part of the type map.
 */
	if ((dtype->dt_flags & LAM_DTHASUB) == 0) {

		span = dtype->dt_upper - dtype->dt_lower;

		if (span < 0) {
			pad = (-span) % dtype->dt_align;
		}
		else {
			rem = span % dtype->dt_align;
			pad = (rem == 0) ? 0 : dtype->dt_align - rem;
		}

		dtype->dt_upper = dtype->dt_upper + pad;
	}
/*
 * When both bounds sit exactly on the data limits, no extent
 * adjustment is needed when handling this datatype.
 */
	if (dtype->dt_lower == dtype->dt_datalow
			&& dtype->dt_upper == dtype->dt_dataup) {
		dtype->dt_flags |= LAM_DTNOXADJ;
	}
}

/*
 *	lam_dtbuffer
 *
 *	Function:	- allocate buffer for copies of datatype
 *			- the buffer covers the data limits of all copies
 *			- the origin is the buffer address minus the lowest
 *			  data offset
 *			- both outputs are null if the count is not
 *			  positive or the data range is empty
 *	Accepts:	- datatype
 *			- count of copies
 *			- buffer (out)
 *			- origin of buffer (out)
 *	Returns:	- MPI_SUCCESS or error code
 */
int
lam_dtbuffer(dtype, count, buffer, origin)

MPI_Datatype		dtype;
int			count;
char			**buffer;
char			**origin;

{
	char		*mem;			/* allocated space */
	int		span;			/* first to last copy */
	int		hi;			/* highest data offset */
	int		lo;			/* lowest data offset */

	if (count > 0) {
		span = (dtype->dt_upper - dtype->dt_lower) * (count - 1);
/*
 * The copies grow upwards for a non-negative extent and downwards
 * for a negative one.
 */
		hi = dtype->dt_dataup;
		lo = dtype->dt_datalow;

		if (span >= 0) {
			hi += span;
		} else {
			lo += span;
		}

		if (hi > lo) {
			mem = malloc((unsigned) (hi - lo));
			*buffer = mem;
/*
 * On allocation failure the origin is left untouched.
 */
			if (mem == 0) {
				return(lam_mkerr(MPI_ERR_OTHER, errno));
			}

			*origin = mem - lo;
			return(MPI_SUCCESS);
		}
	}
/*
 * Nothing to allocate.
 */
	*origin = 0;
	*buffer = 0;

	return(MPI_SUCCESS);
}

/*
 *	lam_copyrev2
 *
 *	Function:	- copy an array of 2-byte elements, swapping
 *			  the two bytes of every element
 *			- the source and destination arrays must not overlap
 *	Accepts:	- destination array
 *			- source array
 *			- # elements in array
 */
void
lam_copyrev2(dest, src, num)

void			*dest;
void			*src;
int			num;

{
	unsigned char	*out = dest;
	unsigned char	*in = src;
	int		left;			/* elements still to copy */

	for (left = num; left > 0; --left, in += 2, out += 2) {
		out[0] = in[1];
		out[1] = in[0];
	}
}

/*
 *	lam_copyrev4
 *
 *	Function:	- copy an array of 4-byte elements, reversing
 *			  the byte order of every element
 *			- the source and destination arrays must not overlap
 *			- with LAM_USE_SHIFTING defined the swap is done on
 *			  whole 4-byte words with shifts and masks,
 *			  otherwise byte by byte
 *	Accepts:	- destination array
 *			- source array
 *			- # elements in array
 */
void
lam_copyrev4(dest, src, num)

void			*dest;
void			*src;
int			num;

{
#ifdef LAM_USE_SHIFTING
	uint4		*out = dest;
	uint4		*in = src;
	uint4		w;			/* word being swapped */
	int		k;

	for (k = 0; k < num; ++k) {
		w = in[k];
		out[k] = ((w & 0x000000FF) << 24)
			| ((w & 0x0000FF00) << 8)
			| ((w >> 8) & 0x0000FF00)
			| ((w >> 24) & 0x000000FF);
	}
#else
	unsigned char	*out = dest;
	unsigned char	*in = src;
	int		left;			/* elements still to copy */
	int		k;

	for (left = num; left > 0; --left, in += 4, out += 4) {
		for (k = 0; k < 4; ++k) {
			out[k] = in[3 - k];
		}
	}
#endif
}

/*
 *	lam_copyrev8
 *
 *	Function:	- copy an array of 8-byte elements, reversing
 *			  the byte order of every element
 *			- the source and destination arrays must not overlap
 *	Accepts:	- destination array
 *			- source array
 *			- # elements in array
 */
void
lam_copyrev8(dest, src, num)

void			*dest;
void			*src;
int			num;

{
	unsigned char	*out = dest;
	unsigned char	*in = src;
	int		left;			/* elements still to copy */
	int		k;

	for (left = num; left > 0; --left, in += 8, out += 8) {
		for (k = 0; k < 8; ++k) {
			out[k] = in[7 - k];
		}
	}
}

/*
 *	lam_copyrevn
 *
 *	Function:	- copy an array of n-byte elements, reversing
 *			  the byte order of every element
 *			- the source and destination arrays must not overlap
 *	Accepts:	- destination array
 *			- source array
 *			- element size
 *			- # elements in array
 */
void
lam_copyrevn(dest, src, nbytes, num)

void			*dest;
void			*src;
int			nbytes;
int			num;

{
	unsigned char	*out = dest;
	unsigned char	*in = src;
	int		total;			/* bytes in the whole array */
	int		base;			/* offset of current element */
	int		k;

	total = num * nbytes;

	for (base = 0; base < total; base += nbytes) {
/*
 * Byte k of the source element lands at the mirrored position
 * of the destination element.
 */
		for (k = 0; k < nbytes; ++k) {
			out[base + nbytes - 1 - k] = in[base + k];
		}
	}
}
