/* avp/src/win95/inline.h */

#ifndef INLINE_INCLUDED
#define INLINE_INCLUDED
#if SUPPORT_MMX
#include "mmx_math.h"
#endif
/*
Watcom PC Inline Functions.
Watcom Standard C does not support the C++ "inline" directive, so these
functions have been written as inline assembler instead.
*/
#ifdef __cplusplus
extern "C" {
#endif
/*
Standard macros. Note that FIXED_TO_INT
and INT_TO_FIXED are very suboptimal in
this version!!!
Also, MUL_INT and ISR are ONLY intended
to be used in Win95, so that Saturn versions
of the same code can be compiled as calls
to hand-optimised assembler functions; they
are unnecessary for code that will never
run on a Saturn.
*/
#define OUR_ABS(x) (((x) < 0) ? -(x) : (x))
#define OUR_SIGN(x) (((x) < 0) ? -1 : +1)
#define OUR_INT_TO_FIXED(x) (int) ((x) * (65536))
#define OUR_FIXED_TO_INT(x) (int) ((x) / (65536))
#define OUR_MUL_INT(a, b) ((a) * (b))
#define OUR_ISR(a, shift) ((a) >> (shift))
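/*
Illustrative use of the macros above (a sketch only; the variable
names here are hypothetical, not part of the original code). Values
are 16.16 fixed point: the low 16 bits hold the fraction, so the
multiplications and divisions by 65536 could also be written as shifts.
*/
#if 0
{
	int oneAndAHalf = OUR_INT_TO_FIXED(3) / 2;        /* 98304 == 1.5  */
	int whole       = OUR_FIXED_TO_INT(oneAndAHalf);  /* 1 (truncates) */
	int signedMag   = OUR_SIGN(-7) * OUR_ABS(-7);     /* -7            */
}
#endif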
/*
win95\item.c functions
*/
void InitialiseTriangleArrayData(void);
void* AllocateTriangleArrayData(int tasize);
/*
General Triangle Array Handler Null Case / Error
*/
void TriangleArrayNullOrError(TRIANGLEARRAY *tarr);
/*
Item Polygon Triangle Array Functions
*/
void Item_Polygon_PrepareTriangleArray_3(TRIANGLEARRAY *qarr);
void Item_Polygon_PrepareTriangleArray_4(TRIANGLEARRAY *qarr);
void Item_Polygon_PrepareTriangleArray_5(TRIANGLEARRAY *qarr);
void Item_Polygon_PrepareTriangleArray_6(TRIANGLEARRAY *qarr);
void Item_Polygon_PrepareTriangleArray_7(TRIANGLEARRAY *qarr);
void Item_Polygon_PrepareTriangleArray_8(TRIANGLEARRAY *qarr);
void Item_Polygon_PrepareTriangleArray_9(TRIANGLEARRAY *qarr);
/*
Item Gouraud Polygon Triangle Array Functions
*/
void Item_GouraudPolygon_PrepareTriangleArray_3(TRIANGLEARRAY *qarr);
void Item_GouraudPolygon_PrepareTriangleArray_4(TRIANGLEARRAY *qarr);
void Item_GouraudPolygon_PrepareTriangleArray_5(TRIANGLEARRAY *qarr);
void Item_GouraudPolygon_PrepareTriangleArray_6(TRIANGLEARRAY *qarr);
void Item_GouraudPolygon_PrepareTriangleArray_7(TRIANGLEARRAY *qarr);
void Item_GouraudPolygon_PrepareTriangleArray_8(TRIANGLEARRAY *qarr);
void Item_GouraudPolygon_PrepareTriangleArray_9(TRIANGLEARRAY *qarr);
/*
Item 2d Textured Polygon Triangle Array Functions
*/
void Item_2dTexturedPolygon_PrepareTriangleArray_3(TRIANGLEARRAY *qarr);
void Item_2dTexturedPolygon_PrepareTriangleArray_4(TRIANGLEARRAY *qarr);
void Item_2dTexturedPolygon_PrepareTriangleArray_5(TRIANGLEARRAY *qarr);
void Item_2dTexturedPolygon_PrepareTriangleArray_6(TRIANGLEARRAY *qarr);
void Item_2dTexturedPolygon_PrepareTriangleArray_7(TRIANGLEARRAY *qarr);
void Item_2dTexturedPolygon_PrepareTriangleArray_8(TRIANGLEARRAY *qarr);
void Item_2dTexturedPolygon_PrepareTriangleArray_9(TRIANGLEARRAY *qarr);
/*
Item Gouraud 2d Textured Polygon Triangle Array Functions
*/
void Item_Gouraud2dTexturedPolygon_PrepareTriangleArray_3(TRIANGLEARRAY *qarr);
void Item_Gouraud2dTexturedPolygon_PrepareTriangleArray_4(TRIANGLEARRAY *qarr);
void Item_Gouraud2dTexturedPolygon_PrepareTriangleArray_5(TRIANGLEARRAY *qarr);
void Item_Gouraud2dTexturedPolygon_PrepareTriangleArray_6(TRIANGLEARRAY *qarr);
void Item_Gouraud2dTexturedPolygon_PrepareTriangleArray_7(TRIANGLEARRAY *qarr);
void Item_Gouraud2dTexturedPolygon_PrepareTriangleArray_8(TRIANGLEARRAY *qarr);
void Item_Gouraud2dTexturedPolygon_PrepareTriangleArray_9(TRIANGLEARRAY *qarr);
/*
Item 3d Textured Polygon Triangle Array Functions
*/
void Item_3dTexturedPolygon_PrepareTriangleArray_3(TRIANGLEARRAY *qarr);
void Item_3dTexturedPolygon_PrepareTriangleArray_4(TRIANGLEARRAY *qarr);
void Item_3dTexturedPolygon_PrepareTriangleArray_5(TRIANGLEARRAY *qarr);
void Item_3dTexturedPolygon_PrepareTriangleArray_6(TRIANGLEARRAY *qarr);
void Item_3dTexturedPolygon_PrepareTriangleArray_7(TRIANGLEARRAY *qarr);
void Item_3dTexturedPolygon_PrepareTriangleArray_8(TRIANGLEARRAY *qarr);
void Item_3dTexturedPolygon_PrepareTriangleArray_9(TRIANGLEARRAY *qarr);
/*
Item Gouraud 3d Textured Polygon Triangle Array Functions
*/
void Item_Gouraud3dTexturedPolygon_PrepareTriangleArray_3(TRIANGLEARRAY *qarr);
void Item_Gouraud3dTexturedPolygon_PrepareTriangleArray_4(TRIANGLEARRAY *qarr);
void Item_Gouraud3dTexturedPolygon_PrepareTriangleArray_5(TRIANGLEARRAY *qarr);
void Item_Gouraud3dTexturedPolygon_PrepareTriangleArray_6(TRIANGLEARRAY *qarr);
void Item_Gouraud3dTexturedPolygon_PrepareTriangleArray_7(TRIANGLEARRAY *qarr);
void Item_Gouraud3dTexturedPolygon_PrepareTriangleArray_8(TRIANGLEARRAY *qarr);
void Item_Gouraud3dTexturedPolygon_PrepareTriangleArray_9(TRIANGLEARRAY *qarr);
/*
Platform Specific 64-Bit Operator Functions
Not all compilers support 64-bit operations, and some platforms may not
even support 64-bit numbers. Support for 64-bit operations is therefore
provided by the platform-specific functions below.
For C++ a new class could be defined. However, the current system is not
compiled as C++, and the Cygnus GNU C++ compiler is not currently working.
*/
/*
These functions have been checked for suitability for
a Pentium and look as if they would pair up okay.
Might be worth a more detailed look at optimising
them though.
Obviously there is a problem with values not being
loaded into registers for these functions, but this
may be unavoidable for 64-bit values with the Watcom
compiler.
*/
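/*
For reference, a portable C sketch of the 64-bit helpers below. It
assumes LONGLONGCH carries a 32-bit low word at offset 0 and a signed
32-bit high word at offset 4, as the [reg]/[reg+4] accesses in the
assembler imply; the member names "lo" and "hi" are assumptions made
for illustration, not the project's actual declaration.
*/
#if 0
static void ADD_LL_C(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c)
{
	unsigned int lo = a->lo + b->lo;       /* low words first     */
	c->hi = a->hi + b->hi + (lo < a->lo);  /* propagate the carry */
	c->lo = lo;
}
static int CMP_LL_C(LONGLONGCH *a, LONGLONGCH *b)
{
	/* sign of (*a - *b): high words compare signed, low words unsigned */
	if (a->hi != b->hi) return (a->hi < b->hi) ? -1 : +1;
	if (a->lo != b->lo) return (a->lo < b->lo) ? -1 : +1;
	return 0;
}
static void ASR_LL_C(LONGLONGCH *a, int shift)
{
	while (shift-- > 0)
	{
		a->lo = (a->lo >> 1) | ((unsigned int)a->hi << 31);
		a->hi >>= 1;  /* assumes >> on a signed int is arithmetic */
	}
}
#endif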
#ifdef __WATCOMC__ /* inline assembler for the Watcom compiler */
/* ADD */
void ADD_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c);
# pragma aux ADD_LL = \
"mov eax,[esi]" \
"mov edx,[esi+4]" \
"add eax,[edi]" \
"adc edx,[edi+4]" \
"mov [ebx],eax" \
"mov [ebx+4],edx" \
parm[esi] [edi] [ebx] \
modify[eax edx];
/* ADD ++ */
void ADD_LL_PP(LONGLONGCH *c, LONGLONGCH *a);
# pragma aux ADD_LL_PP = \
"mov eax,[esi]" \
"mov edx,[esi+4]" \
"add [edi],eax" \
"adc [edi+4],edx" \
parm[edi] [esi] \
modify[eax edx];
/* SUB */
void SUB_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c);
# pragma aux SUB_LL = \
"mov eax,[esi]" \
"mov edx,[esi+4]" \
"sub eax,[edi]" \
"sbb edx,[edi+4]" \
"mov [ebx],eax" \
"mov [ebx+4],edx" \
parm[esi] [edi] [ebx] \
modify[eax edx];
/* SUB -- */
void SUB_LL_MM(LONGLONGCH *c, LONGLONGCH *a);
# pragma aux SUB_LL_MM = \
"mov eax,[esi]" \
"mov edx,[esi+4]" \
"sub [edi],eax" \
"sbb [edi+4],edx" \
parm[edi] [esi] \
modify[eax edx];
/*
MUL
This is the multiply we use, the 32 x 32 = 64 widening version
*/
void MUL_I_WIDE(int a, int b, LONGLONGCH *c);
# pragma aux MUL_I_WIDE = \
"imul edx"\
"mov [ebx],eax" \
"mov [ebx+4],edx" \
parm[eax] [edx] [ebx] \
modify[eax edx];
/*
CMP
A three-way compare: returns -1, 0 or +1 according to the sign of
*a - *b, and so substitutes for ==, >, <, >=, <=
*/
int CMP_LL(LONGLONGCH *a, LONGLONGCH *b);
# pragma aux CMP_LL = \
"mov eax,[ebx]" \
"mov edx,[ebx+4]" \
"sub eax,[ecx]" \
"sbb edx,[ecx+4]" \
"and edx,edx" \
"jne llnz" \
"and eax,eax" \
"jne llnz" \
"xor eax,eax" \
"jmp llgs" \
"llnz:" \
"mov eax,1" \
"and edx,edx" \
"jge llgs" \
"neg eax" \
"llgs:" \
parm[ebx] [ecx] \
value[eax] \
modify[edx];
/* EQUALS */
void EQUALS_LL(LONGLONGCH *a, LONGLONGCH *b);
# pragma aux EQUALS_LL = \
"mov eax,[esi]" \
"mov edx,[esi+4]" \
"mov [edi],eax" \
"mov [edi+4],edx" \
parm[edi] [esi] \
modify[eax edx];
/* NEGATE */
void NEG_LL(LONGLONGCH *a);
# pragma aux NEG_LL = \
"not dword ptr[esi]" \
"not dword ptr[esi+4]" \
"add dword ptr[esi],1" \
"adc dword ptr[esi+4],0" \
parm[esi];
/* ASR */
void ASR_LL(LONGLONGCH *a, int shift);
# pragma aux ASR_LL = \
"and eax,eax" \
"jle asrdn" \
"asrlp:" \
"sar dword ptr[esi+4],1" \
"rcr dword ptr[esi],1" \
"dec eax" \
"jne asrlp" \
"asrdn:" \
parm[esi] [eax];
/* Convert int to LONGLONGCH */
void IntToLL(LONGLONGCH *a, int *b);
# pragma aux IntToLL = \
"mov eax,[esi]" \
"cdq" \
"mov [edi],eax" \
"mov [edi+4],edx" \
parm[edi] [esi] \
modify[eax edx];
/*
Fixed Point Multiply.
16.16 * 16.16 -> 16.16
or
16.16 * 0.32 -> 0.32
A proper version of this function ought to read
16.16 * 16.16 -> 32.16
but this would require a long long result
Algorithm:
Take the mid 32 bits of the 64 bit result
*/
/*
These functions have been checked for suitability for
a Pentium and look as if they would work adequately.
Might be worth a more detailed look at optimising
them though.
*/
#if 0
int MUL_FIXED(int a, int b);
# pragma aux MUL_FIXED = \
"imul edx" \
"mov ax,dx" \
"rol eax,16" \
parm[eax] [edx] \
value[eax] \
modify[edx];
#else
int MUL_FIXED(int a, int b);
# pragma aux MUL_FIXED = \
"imul edx" \
"shrd eax,edx,16" \
parm[eax] [edx] \
value[eax] \
modify[edx];
#endif
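/*
A portable sketch of the same operation for a compiler with a 64-bit
long long type (which the Watcom build here could not rely on):
*/
#if 0
static int MUL_FIXED_C(int a, int b)
{
	/* widen to 64 bits, then keep the middle 32 bits of the product */
	return (int)(((long long)a * b) >> 16);
}
#endif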
/*
Fixed Point Divide - returns a / b
*/
int DIV_FIXED(int a, int b);
# pragma aux DIV_FIXED = \
"cdq" \
"rol eax,16" \
"mov dx,ax" \
"xor ax,ax" \
"idiv ebx" \
parm[eax] [ebx] \
value[eax] \
modify[edx];
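/*
The equivalent portable sketch: pre-shift the dividend by 16 bits so
the quotient keeps the 16.16 fraction (again assuming long long):
*/
#if 0
static int DIV_FIXED_C(int a, int b)
{
	return (int)(((long long)a << 16) / b);
}
#endif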
/*
Multiply and Divide Functions.
*/
/*
32/32 division
This macro is a function on some other platforms
*/
#define DIV_INT(a, b) ((a) / (b))
/*
A Narrowing 64/32 Division
*/
int NarrowDivide(LONGLONGCH *a, int b);
# pragma aux NarrowDivide = \
"mov eax,[esi]" \
"mov edx,[esi+4]" \
"idiv ebx" \
parm[esi] [ebx] \
value[eax] \
modify[edx];
/*
This function performs a Widening Multiply followed by a Narrowing Divide.
a = (a * b) / c
*/
int WideMulNarrowDiv(int a, int b, int c);
# pragma aux WideMulNarrowDiv = \
"imul edx"\
"idiv ebx" \
parm[eax] [edx] [ebx] \
value[eax];
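/*
Portable sketches of the two routines above, assuming long long and
the lo/hi LONGLONGCH layout described earlier. As with the assembler,
behaviour is undefined if the quotient does not fit in 32 bits.
*/
#if 0
static int NarrowDivide_C(LONGLONGCH *a, int b)
{
	long long wide = ((long long)a->hi << 32) | a->lo;
	return (int)(wide / b);
}
static int WideMulNarrowDiv_C(int a, int b, int c)
{
	return (int)(((long long)a * b) / c);  /* 64-bit intermediate */
}
#endif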
/*
Function to rotate a VECTORCH using a MATRIXCH
This is the C function
x = MUL_FIXED(m->mat11, v->vx);
x += MUL_FIXED(m->mat21, v->vy);
x += MUL_FIXED(m->mat31, v->vz);
y = MUL_FIXED(m->mat12, v->vx);
y += MUL_FIXED(m->mat22, v->vy);
y += MUL_FIXED(m->mat32, v->vz);
z = MUL_FIXED(m->mat13, v->vx);
z += MUL_FIXED(m->mat23, v->vy);
z += MUL_FIXED(m->mat33, v->vz);
v->vx = x;
v->vy = y;
v->vz = z;
This is the MUL_FIXED inline assembler function
imul edx
shrd eax,edx,16
typedef struct matrixch {
int mat11; 0
int mat12; 4
int mat13; 8
int mat21; 12
int mat22; 16
int mat23; 20
int mat31; 24
int mat32; 28
int mat33; 32
} MATRIXCH;
*/
void RotateVector_ASM(VECTORCH *v, MATRIXCH *m);
# pragma aux RotateVector_ASM = \
\
"push eax" \
"push ebx" \
"push ecx" \
"push edx" \
"push ebp" \
\
"mov eax,[edi + 0]" \
"imul DWORD PTR [esi + 0]" \
"shrd eax,edx,16" \
"mov ecx,eax"\
"mov eax,[edi + 12]" \
"imul DWORD PTR [esi + 4]" \
"shrd eax,edx,16" \
"add ecx,eax" \
"mov eax,[edi + 24]" \
"imul DWORD PTR [esi + 8]" \
"shrd eax,edx,16" \
"add ecx,eax" \
\
"mov eax,[edi + 4]" \
"imul DWORD PTR [esi + 0]" \
"shrd eax,edx,16" \
"mov ebx,eax"\
"mov eax,[edi + 16]" \
"imul DWORD PTR [esi + 4]" \
"shrd eax,edx,16" \
"add ebx,eax" \
"mov eax,[edi + 28]" \
"imul DWORD PTR [esi + 8]" \
"shrd eax,edx,16" \
"add ebx,eax" \
\
"mov eax,[edi + 8]" \
"imul DWORD PTR [esi + 0]" \
"shrd eax,edx,16" \
"mov ebp,eax"\
"mov eax,[edi + 20]" \
"imul DWORD PTR [esi + 4]" \
"shrd eax,edx,16" \
"add ebp,eax" \
"mov eax,[edi + 32]" \
"imul DWORD PTR [esi + 8]" \
"shrd eax,edx,16" \
"add ebp,eax" \
\
"mov [esi + 0],ecx" \
"mov [esi + 4],ebx" \
"mov [esi + 8],ebp" \
\
"pop ebp" \
"pop edx" \
"pop ecx" \
"pop ebx" \
"pop eax" \
\
parm[esi] [edi];
/*
Here is the same function, this time copying the result to a second vector
*/
void RotateAndCopyVector_ASM(VECTORCH *v1, VECTORCH *v2, MATRIXCH *m);
# pragma aux RotateAndCopyVector_ASM = \
\
"push eax" \
"push ebx" \
"push ecx" \
"push ebp" \
\
"push edx" \
"mov eax,[edi + 0]" \
"imul DWORD PTR [esi + 0]" \
"shrd eax,edx,16" \
"mov ecx,eax"\
"mov eax,[edi + 12]" \
"imul DWORD PTR [esi + 4]" \
"shrd eax,edx,16" \
"add ecx,eax" \
"mov eax,[edi + 24]" \
"imul DWORD PTR [esi + 8]" \
"shrd eax,edx,16" \
"add ecx,eax" \
\
"mov eax,[edi + 4]" \
"imul DWORD PTR [esi + 0]" \
"shrd eax,edx,16" \
"mov ebx,eax"\
"mov eax,[edi + 16]" \
"imul DWORD PTR [esi + 4]" \
"shrd eax,edx,16" \
"add ebx,eax" \
"mov eax,[edi + 28]" \
"imul DWORD PTR [esi + 8]" \
"shrd eax,edx,16" \
"add ebx,eax" \
\
"mov eax,[edi + 8]" \
"imul DWORD PTR [esi + 0]" \
"shrd eax,edx,16" \
"mov ebp,eax"\
"mov eax,[edi + 20]" \
"imul DWORD PTR [esi + 4]" \
"shrd eax,edx,16" \
"add ebp,eax" \
"mov eax,[edi + 32]" \
"imul DWORD PTR [esi + 8]" \
"shrd eax,edx,16" \
"add ebp,eax" \
\
"pop edx" \
"mov [edx + 0],ecx" \
"mov [edx + 4],ebx" \
"mov [edx + 8],ebp" \
\
"pop ebp" \
"pop ecx" \
"pop ebx" \
"pop eax" \
\
parm[esi] [edx] [edi];
#if (SupportFPMathsFunctions || SupportFPSquareRoot)
/*
Square Root
Returns the Square Root of a 32-bit number
*/
static long temp;
static long temp2;
int SqRoot32(int A);
# pragma aux SqRoot32 = \
"finit" \
"mov temp,eax" \
"fild temp" \
"fsqrt" \
"fistp temp2" \
"fwait" \
"mov eax,temp2" \
parm[eax] \
value[eax];
#endif
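/*
A portable sketch of the same chain (integer in, FPU square root,
integer out), assuming the C library's sqrt() is acceptable. Note
that fistp rounds to nearest while the cast below truncates, so the
two can differ by one for inputs just under a halfway point.
*/
#if 0
static int SqRoot32_C(int A)
{
	return (int)sqrt((double)A);  /* needs <math.h> */
}
#endif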
/*
This may look ugly (it is) but it is a MUCH faster way to convert "float" into "int" than
the function call "CHP" used by the WATCOM compiler.
*/
static float fptmp;
static int itmp;
void FloatToInt(void);
# pragma aux FloatToInt = \
"fld fptmp" \
"fistp itmp";
/*
This macro makes usage of the above function easier and more elegant
*/
#define f2i(a, b) { \
fptmp = (b); \
FloatToInt(); \
a = itmp;}
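/*
Example (a sketch; "i" is a hypothetical local): fistp uses the FPU
rounding mode, round-to-nearest by default, so f2i rounds where a
plain C cast would truncate.
*/
#if 0
{
	int i;
	f2i(i, 1.7f);  /* i == 2; (int)1.7f would give 1 */
}
#endif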
#elif defined(_MSC_VER) /* inline assembler for the Microsoft compiler */
/* ADD */
static void ADD_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c)
{
_asm
{
mov esi,a
mov edi,b
mov ebx,c
mov eax,[esi]
mov edx,[esi+4]
add eax,[edi]
adc edx,[edi+4]
mov [ebx],eax
mov [ebx+4],edx
}
}
/* ADD ++ */
static void ADD_LL_PP(LONGLONGCH *c, LONGLONGCH *a)
{
_asm
{
mov edi,c
mov esi,a
mov eax,[esi]
mov edx,[esi+4]
add [edi],eax
adc [edi+4],edx
}
}
/* SUB */
static void SUB_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c)
{
_asm
{
mov esi,a
mov edi,b
mov ebx,c
mov eax,[esi]
mov edx,[esi+4]
sub eax,[edi]
sbb edx,[edi+4]
mov [ebx],eax
mov [ebx+4],edx
}
}
/* SUB -- */
static void SUB_LL_MM(LONGLONGCH *c, LONGLONGCH *a)
{
_asm
{
mov edi,c
mov esi,a
mov eax,[esi]
mov edx,[esi+4]
sub [edi],eax
sbb [edi+4],edx
}
}
/*
MUL
This is the multiply we use, the 32 x 32 = 64 widening version
*/
static void MUL_I_WIDE(int a, int b, LONGLONGCH *c)
{
_asm
{
mov eax,a
mov ebx,c
imul b
mov [ebx],eax
mov [ebx+4],edx
}
}
/*
CMP
A three-way compare: returns -1, 0 or +1 according to the sign of
*a - *b, and so substitutes for ==, >, <, >=, <=
*/
static int CMP_LL(LONGLONGCH *a, LONGLONGCH *b)
{
int retval = 0;
_asm
{
mov ebx,a
mov ecx,b
mov eax,[ebx]
mov edx,[ebx+4]
sub eax,[ecx]
sbb edx,[ecx+4]
and edx,edx
jne llnz
and eax,eax
je llgs
llnz:
mov retval,1
and edx,edx
jge llgs
neg retval
llgs:
}
return retval;
}
/* EQUALS */
static void EQUALS_LL(LONGLONGCH *a, LONGLONGCH *b)
{
_asm
{
mov edi,a
mov esi,b
mov eax,[esi]
mov edx,[esi+4]
mov [edi],eax
mov [edi+4],edx
}
}
/* NEGATE */
static void NEG_LL(LONGLONGCH *a)
{
_asm
{
mov esi,a
not dword ptr[esi]
not dword ptr[esi+4]
add dword ptr[esi],1
adc dword ptr[esi+4],0
}
}
/* ASR */
static void ASR_LL(LONGLONGCH *a, int shift)
{
_asm
{
mov esi,a
mov eax,shift
and eax,eax
jle asrdn
asrlp:
sar dword ptr[esi+4],1
rcr dword ptr[esi],1
dec eax
jne asrlp
asrdn:
}
}
/* Convert int to LONGLONGCH */
static void IntToLL(LONGLONGCH *a, int *b)
{
_asm
{
mov esi,b
mov edi,a
mov eax,[esi]
cdq
mov [edi],eax
mov [edi+4],edx
}
}
/*
Fixed Point Multiply.
16.16 * 16.16 -> 16.16
or
16.16 * 0.32 -> 0.32
A proper version of this function ought to read
16.16 * 16.16 -> 32.16
but this would require a long long result
Algorithm:
Take the mid 32 bits of the 64 bit result
*/
/*
These functions have been checked for suitability for
a Pentium and look as if they would work adequately.
Might be worth a more detailed look at optimising
them though.
*/
static int MUL_FIXED(int a, int b)
{
int retval;
_asm
{
mov eax,a
imul b
shrd eax,edx,16
mov retval,eax
}
return retval;
}
/*
Fixed Point Divide - returns a / b
*/
static int DIV_FIXED(int a, int b)
{
int retval;
_asm
{
mov eax,a
cdq
rol eax,16
mov dx,ax
xor ax,ax
idiv b
mov retval,eax
}
return retval;
}
/*
Multiply and Divide Functions.
*/
/*
32/32 division
This macro is a function on some other platforms
*/
#define DIV_INT(a, b) ((a) / (b))
/*
A Narrowing 64/32 Division
*/
static int NarrowDivide(LONGLONGCH *a, int b)
{
int retval;
_asm
{
mov esi,a
mov eax,[esi]
mov edx,[esi+4]
idiv b
mov retval,eax
}
return retval;
}
/*
This function performs a Widening Multiply followed by a Narrowing Divide.
a = (a * b) / c
*/
static int WideMulNarrowDiv(int a, int b, int c)
{
int retval;
_asm
{
mov eax,a
imul b
idiv c
mov retval,eax
}
return retval;
}
/*
Function to rotate a VECTORCH using a MATRIXCH
This is the C function
x = MUL_FIXED(m->mat11, v->vx);
x += MUL_FIXED(m->mat21, v->vy);
x += MUL_FIXED(m->mat31, v->vz);
y = MUL_FIXED(m->mat12, v->vx);
y += MUL_FIXED(m->mat22, v->vy);
y += MUL_FIXED(m->mat32, v->vz);
z = MUL_FIXED(m->mat13, v->vx);
z += MUL_FIXED(m->mat23, v->vy);
z += MUL_FIXED(m->mat33, v->vz);
v->vx = x;
v->vy = y;
v->vz = z;
This is the MUL_FIXED inline assembler function
imul edx
shrd eax,edx,16
typedef struct matrixch {
int mat11; 0
int mat12; 4
int mat13; 8
int mat21; 12
int mat22; 16
int mat23; 20
int mat31; 24
int mat32; 28
int mat33; 32
} MATRIXCH;
*/
static void RotateVector_ASM(VECTORCH *v, MATRIXCH *m)
{
_asm
{
mov esi,v
mov edi,m
mov eax,[edi + 0]
imul DWORD PTR [esi + 0]
shrd eax,edx,16
mov ecx,eax
mov eax,[edi + 12]
imul DWORD PTR [esi + 4]
shrd eax,edx,16
add ecx,eax
mov eax,[edi + 24]
imul DWORD PTR [esi + 8]
shrd eax,edx,16
add ecx,eax
mov eax,[edi + 4]
imul DWORD PTR [esi + 0]
shrd eax,edx,16
mov ebx,eax
mov eax,[edi + 16]
imul DWORD PTR [esi + 4]
shrd eax,edx,16
add ebx,eax
mov eax,[edi + 28]
imul DWORD PTR [esi + 8]
shrd eax,edx,16
add ebx,eax
mov eax,[edi + 8]
imul DWORD PTR [esi + 0]
shrd eax,edx,16
mov ebp,eax
mov eax,[edi + 20]
imul DWORD PTR [esi + 4]
shrd eax,edx,16
add ebp,eax
mov eax,[edi + 32]
imul DWORD PTR [esi + 8]
shrd eax,edx,16
add ebp,eax
mov [esi + 0],ecx
mov [esi + 4],ebx
mov [esi + 8],ebp
}
}
/*
Here is the same function, this time copying the result to a second vector
*/
static void RotateAndCopyVector_ASM(VECTORCH *v1, VECTORCH *v2, MATRIXCH *m)
{
_asm
{
mov esi,v1
mov edi,m
mov eax,[edi + 0]
imul DWORD PTR [esi + 0]
shrd eax,edx,16
mov ecx,eax
mov eax,[edi + 12]
imul DWORD PTR [esi + 4]
shrd eax,edx,16
add ecx,eax
mov eax,[edi + 24]
imul DWORD PTR [esi + 8]
shrd eax,edx,16
add ecx,eax
mov eax,[edi + 4]
imul DWORD PTR [esi + 0]
shrd eax,edx,16
mov ebx,eax
mov eax,[edi + 16]
imul DWORD PTR [esi + 4]
shrd eax,edx,16
add ebx,eax
mov eax,[edi + 28]
imul DWORD PTR [esi + 8]
shrd eax,edx,16
add ebx,eax
mov eax,[edi + 8]
imul DWORD PTR [esi + 0]
shrd eax,edx,16
mov ebp,eax
mov eax,[edi + 20]
imul DWORD PTR [esi + 4]
shrd eax,edx,16
add ebp,eax
mov eax,[edi + 32]
imul DWORD PTR [esi + 8]
shrd eax,edx,16
add ebp,eax
mov edx,v2
mov [edx + 0],ecx
mov [edx + 4],ebx
mov [edx + 8],ebp
}
}
#if (SupportFPMathsFunctions || SupportFPSquareRoot)
/*
Square Root
Returns the Square Root of a 32-bit number
*/
static long temp;
static long temp2;
static int SqRoot32(int A)
{
_asm
{
finit
fild A
fsqrt
fistp temp2
fwait
}
return (int)temp2;
}
#endif
/*
This may look ugly (it is) but it is a MUCH faster way to convert "float" into "int" than
the function call "CHP" used by the WATCOM compiler.
*/
static float fptmp;
static int itmp;
static void FloatToInt(void)
{
_asm
{
fld fptmp
fistp itmp
}
}
/*
This macro makes usage of the above function easier and more elegant
*/
#define f2i(a, b) { \
fptmp = (b); \
FloatToInt(); \
a = itmp;}
#else /* other compiler ? */
/* #error "Unknown compiler" */
void ADD_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c);
void ADD_LL_PP(LONGLONGCH *c, LONGLONGCH *a);
void SUB_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c);
void SUB_LL_MM(LONGLONGCH *c, LONGLONGCH *a);
void MUL_I_WIDE(int a, int b, LONGLONGCH *c);
int CMP_LL(LONGLONGCH *a, LONGLONGCH *b);
void EQUALS_LL(LONGLONGCH *a, LONGLONGCH *b);
void NEG_LL(LONGLONGCH *a);
void ASR_LL(LONGLONGCH *a, int shift);
void IntToLL(LONGLONGCH *a, int *b);
int MUL_FIXED(int a, int b);
int DIV_FIXED(int a, int b);
#define DIV_INT(a, b) ((a) / (b))
int NarrowDivide(LONGLONGCH *a, int b);
int WideMulNarrowDiv(int a, int b, int c);
void RotateVector_ASM(VECTORCH *v, MATRIXCH *m);
void RotateAndCopyVector_ASM(VECTORCH *v1, VECTORCH *v2, MATRIXCH *m);
int f2i(float f);
#endif
/* These functions are in plspecfn.c */
int WideMul2NarrowDiv(int a, int b, int c, int d, int e);
int _Dot(VECTORCH *vptr1, VECTORCH *vptr2);
void MakeV(VECTORCH *v1, VECTORCH *v2, VECTORCH *v3);
void AddV(VECTORCH *v1, VECTORCH *v2);
void RotVect(VECTORCH *v, MATRIXCH *m);
void CopyClipPoint(CLIP_POINT *cp1, CLIP_POINT *cp2);
#if SUPPORT_MMX
#define RotateVector(v,m) (use_mmx_math ? MMX_VectorTransform((v),(m)) : _RotateVector((v),(m)))
#define RotateAndCopyVector(v_in,v_out,m) (use_mmx_math ? MMX_VectorTransformed((v_out),(v_in),(m)) : _RotateAndCopyVector((v_in),(v_out),(m)))
#define Dot(v1,v2) (use_mmx_math ? MMXInline_VectorDot((v1),(v2)) : _Dot((v1),(v2)))
#define DotProduct(v1,v2) (use_mmx_math ? MMX_VectorDot((v1),(v2)) : _DotProduct((v1),(v2)))
#else /* ! SUPPORT_MMX */
#define RotateVector(v,m) (_RotateVector((v),(m)))
#define RotateAndCopyVector(v_in,v_out,m) (_RotateAndCopyVector((v_in),(v_out),(m)))
#define Dot(v1,v2) (_Dot((v1),(v2)))
#define DotProduct(v1,v2) (_DotProduct((v1),(v2)))
#endif /* ? SUPPORT_MMX */
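/*
Call sites use the wrapper macros and never name the MMX routines
directly, e.g. (a sketch; "v" and "m" are hypothetical locals):
*/
#if 0
{
	VECTORCH v;
	MATRIXCH m;
	RotateVector(&v, &m);  /* dispatches to the MMX or C implementation */
}
#endif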
#ifdef __cplusplus
}
#endif
#endif