/*
 * addblkf.cc --
 *
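 *      Adds a block of 16-bit values plus a constant to a low-pass
 *      filtered 8x8 block of 8-bit pixels, clamping the sums to 0..255.
 *      addblkf_good() evaluates every filter tap directly; addblkf()
 *      produces the same output from rows cached in packed 32-bit words.
 *      main() is a small driver that runs one of the two and prints
 *      the resulting block.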
 *
 * Copyright (c) 1996-2002 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * A. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * B. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * C. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <osfcn.h>
#include <stdio.h>
#include <sys/types.h>
#include "endian.h"
/*
 * UCLIMIT(x) -- saturate x to the range of an 8-bit pixel.
 *
 * The expansion assigns to a variable named `t' that is NOT a macro
 * argument: the caller must have a signed int called `t' in scope.
 * The shifts by 31 assume that int is 32 bits wide and that >> on a
 * negative value replicates the sign bit.
 *
 *   x < 0        -> 0
 *   0 <= x < 256 -> x
 *   x >= 256     -> all bits set (-1)
 *
 * Because the overflow case yields -1 rather than 255, only the low
 * eight bits of the result are meaningful: store it in a u_char or
 * mask it with 0xff.
 */
#define UCLIMIT(x) ((t = (x)), (t &= ~(t>>31)), (t | ~((t-256) >> 31)))
/*
 * SPLICE_PIXEL(o, pix, pos) -- OR the 8-bit value pix into the 32-bit
 * word o.  pos is the big-endian bit offset of the pixel (24, 16, 8, 0
 * for the first through fourth pixel); on a little-endian host the
 * shift is mirrored (24 - pos) so that, once o is stored to memory,
 * the pos == 24 pixel still lands at the lowest address.
 *
 * BYTE_ORDER and LITTLE_ENDIAN are not defined in this file --
 * presumably "endian.h" supplies them (confirm).  If neither is
 * defined the preprocessor treats both as 0 and the little-endian
 * variant is selected.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SPLICE_PIXEL(o, pix, pos) ((o) |= (pix) << (24 - (pos)))
#else
#define SPLICE_PIXEL(o, pix, pos) ((o) |= (pix) << (pos))
#endif

/*
 * addblkf_good --
 *
 *	Low-pass filter an 8x8 block of pixels, add a block of 16-bit
 *	values and a constant to it, and store the result clamped to
 *	0..255.  Every filter tap is evaluated directly.
 *
 *	Filter weights (all pre-scaled so each entry sums to 16x a pixel):
 *	  corners          the pixel itself
 *	  outer rows/cols  1-2-1 along the edge
 *	  interior         1 2 1 / 2 4 2 / 1 2 1
 *	The scaled value is rounded with (v + 8) >> 4.
 *
 *	dc	constant added to every output pixel
 *	blk	64 values, 8 per row, added to the filtered pixels
 *	in	top-left pixel of the input block; rows are `stride'
 *		bytes apart
 *	out	top-left pixel of the output block; same row spacing
 *	stride	distance in bytes between rows of `in' and of `out'
 *
 *	The whole input block is filtered into a scratch array before
 *	anything is stored, so `in' and `out' may be the same block.
 */
void addblkf_good(int dc, short* blk, u_char* in,
		  u_char* out, u_int stride)
{
	int filt[64];			/* filtered block, scaled by 16 */
	u_char* top = in;		/* row 0 */
	u_char* bot = in + 7 * stride;	/* row 7 */
	int i, j;

	/* corners: weight 1, scaled to 16 */
	filt[0] = top[0] << 4;
	filt[7] = top[7] << 4;
	filt[56] = bot[0] << 4;
	filt[63] = bot[7] << 4;

	for (i = 1; i < 7; ++i) {
		u_char* row = in + i * stride;
		u_char* above = row - stride;
		u_char* below = row + stride;

		/* top and bottom edges: filter along the row */
		filt[i] = (top[i - 1] + (top[i] << 1) + top[i + 1]) << 2;
		filt[56 + i] = (bot[i - 1] + (bot[i] << 1) + bot[i + 1]) << 2;

		/* left and right edges: filter down the column */
		filt[i * 8] = (above[0] + (row[0] << 1) + below[0]) << 2;
		filt[i * 8 + 7] = (above[7] + (row[7] << 1) + below[7]) << 2;

		/* interior: full 3x3 kernel */
		for (j = 1; j < 7; ++j) {
			filt[i * 8 + j] =
				above[j - 1] + (above[j] << 1) + above[j + 1] +
				(row[j - 1] << 1) + (row[j] << 2) +
				(row[j + 1] << 1) +
				below[j - 1] + (below[j] << 1) + below[j + 1];
		}
	}

	for (i = 0; i < 8; ++i) {
		for (j = 0; j < 8; ++j) {
			/* round and drop the x16 scale */
			int v = (filt[i * 8 + j] + 8) >> 4;

			v += blk[i * 8 + j] + dc;
			/* saturate to an 8-bit pixel */
			if (v < 0)
				v = 0;
			else if (v > 255)
				v = 255;
			out[j] = (u_char)v;
		}
		out += stride;
	}
}

/* Saturate v to the 0..255 range of an 8-bit pixel. */
static u_char addblkf_clip(int v)
{
	if (v < 0)
		return (0);
	if (v > 255)
		return (255);
	return ((u_char)v);
}

/*
 * Pack four consecutive pixels into one word, p[0] in the top byte.
 * Each byte is widened to u_int first so that a pixel >= 128 is never
 * shifted into the sign bit of an int.
 */
static u_int addblkf_pack4(const u_char* p)
{
	return ((u_int)p[0] << 24 | (u_int)p[1] << 16 |
		(u_int)p[2] << 8 | (u_int)p[3]);
}

/*
 * Vertical 1-2-1 sums for four adjacent columns.  The three packed rows
 * are split into even and odd columns so that two 16-bit sums fit in
 * each word without carrying into each other (a sum is at most 4*255).
 * col[0..3] receive the sums left to right.
 */
static void addblkf_colsum4(u_int above, u_int cur, u_int below, u_int* col)
{
	const u_int lanes = 0xff00ff;
	u_int even = ((above >> 8) & lanes) + (((cur >> 8) & lanes) << 1) +
		     ((below >> 8) & lanes);
	u_int odd = (above & lanes) + ((cur & lanes) << 1) + (below & lanes);

	col[0] = even >> 16;
	col[1] = odd >> 16;
	col[2] = even & 0xffff;
	col[3] = odd & 0xffff;
}

/*
 * Emit the top or bottom row of the block from its packed pixels
 * (left holds pixels 0-3, right pixels 4-7): the two corners are
 * passed through unfiltered, the six pixels between them get a
 * horizontal 1-2-1 filter rounded with (s + 2) >> 2.
 */
static void addblkf_edge_row(u_int left, u_int right, int dc,
			     const short* blk, u_char* out)
{
	u_int px[8];
	int k;

	for (k = 0; k < 4; ++k) {
		px[k] = (left >> (24 - 8 * k)) & 0xff;
		px[k + 4] = (right >> (24 - 8 * k)) & 0xff;
	}

	out[0] = addblkf_clip((int)(px[0] + blk[0] + dc));
	for (k = 1; k < 7; ++k) {
		u_int s = (px[k - 1] + (px[k] << 1) + px[k + 1] + 2) >> 2;

		out[k] = addblkf_clip((int)(s + blk[k] + dc));
	}
	out[7] = addblkf_clip((int)(px[7] + blk[7] + dc));
}

/*
 * addblkf --
 *
 *	Low-pass filter an 8x8 block of pixels, add a block of 16-bit
 *	values and a constant to it, and store the result clamped to
 *	0..255.  Corners are copied, pixels on the outer rows and
 *	columns get a 1-2-1 filter along the edge, interior pixels get
 *	the 3x3 kernel 1 2 1 / 2 4 2 / 1 2 1 (sum 16, rounded).
 *
 *	Three input rows are cached as packed 32-bit words and rolled
 *	down the block, so each input pixel is loaded exactly once and
 *	the vertical part of the kernel is computed two columns per add.
 *
 *	dc	constant added to every output pixel
 *	blk	64 values, 8 per row, added to the filtered pixels
 *	in	top-left pixel of the input block; rows are `stride'
 *		bytes apart
 *	out	top-left pixel of the output block; same row spacing.
 *		No alignment is required: pixels are stored bytewise.
 *	stride	distance in bytes between rows of `in' and of `out'
 *
 *	Every input row is loaded before the output row with the same
 *	index is stored, so `in' and `out' may be the same block.
 */
void addblkf(int dc, short* blk, u_char* in,
	     u_char* out, u_int stride)
{
	u_int r00, r01;		/* row above the one being produced */
	u_int r10, r11;		/* row being produced */
	int k, j;

	/* top row */
	r00 = addblkf_pack4(in);
	r01 = addblkf_pack4(in + 4);
	addblkf_edge_row(r00, r01, dc, blk, out);
	out += stride;
	blk += 8;

	r10 = addblkf_pack4(in + stride);
	r11 = addblkf_pack4(in + stride + 4);

	/* rows 1..6 */
	for (k = 0; k < 6; ++k) {
		const u_char* next = in + (k + 2) * stride;
		u_int r20 = addblkf_pack4(next);	/* row below */
		u_int r21 = addblkf_pack4(next + 4);
		u_int col[8];	/* vertical 1-2-1 sum of each column */

		addblkf_colsum4(r00, r10, r20, col);
		addblkf_colsum4(r01, r11, r21, col + 4);

		/* left edge: vertical filter only, scale 4 */
		out[0] = addblkf_clip((int)(((col[0] + 2) >> 2) + blk[0] + dc));
		/* interior: 1-2-1 across the column sums, scale 16 */
		for (j = 1; j < 7; ++j) {
			u_int s = (col[j - 1] + (col[j] << 1) +
				   col[j + 1] + 8) >> 4;

			out[j] = addblkf_clip((int)(s + blk[j] + dc));
		}
		/* right edge: vertical filter only, scale 4 */
		out[7] = addblkf_clip((int)(((col[7] + 2) >> 2) + blk[7] + dc));

		out += stride;
		blk += 8;

		/* roll the cached rows up by one */
		r00 = r10;
		r01 = r11;
		r10 = r20;
		r11 = r21;
	}

	/* bottom row: after the last roll r10/r11 hold input row 7 */
	addblkf_edge_row(r10, r11, dc, blk, out);
}

/*
 * main --
 *
 *	Test driver.  Takes exactly one argument; anything else exits
 *	with status 1.  Fills an 8x8 input block with 10 and the added
 *	block with 10, runs addblkf_good() if the argument starts with
 *	'w' and addblkf() otherwise (dc = 0, stride = 8), then prints
 *	the 64 output pixels as eight tab-separated rows.
 */
int main(int argc, char** argv)
{
	u_char in[64];
	u_char out[64];
	short blk[64];
	u_char* p;
	int i, j;

	if (argc != 2)
		return (1);

	for (i = 0; i < 64; ++i) {
		in[i] = 10;
		out[i] = 255;	/* sentinel: shows up if a pixel is never written */
		blk[i] = 10;
	}
	if (argv[1][0] == 'w')
		addblkf_good(0, blk, in, out, 8);
	else
		addblkf(0, blk, in, out, 8);

	p = out;
	for (i = 0; i < 8; ++i) {
		for (j = 0; j < 8; ++j)
			printf("%d\t", *p++);
		printf("\n");
	}
	return (0);
}
