Implemented (most of) the inline assembly in inline.h

Steven Fuller 2001-07-29 18:25:45 +00:00 committed by Patryk Obara
parent 44d4752e83
commit 5c497c61a6
4 changed files with 724 additions and 180 deletions

@@ -6,7 +6,7 @@ CXXFLAGS = $(CFLAGS)
LDLIBS = -lm # /home/relnev/ElectricFence-2.2.2/libefence.a
CFLAGS += `sdl-config --cflags`
LDLIBS += `sdl-config --libs`
LDLIBS += -L/usr/X11R6/lib -lX11 -lXext `sdl-config --libs`
AFLAGS = -g -Iinclude/ -w+macro-params -w+orphan-labels -w+number-overflow

@@ -2323,8 +2323,6 @@ int DestroyActiveVDB(VIEWDESCRIPTORBLOCK *dblockptr);
void PlatformSpecificVDBInit(VIEWDESCRIPTORBLOCK *vdb);
int SqRoot32(int A);
int SqRoot64(LONGLONGCH *A);
/* CDF 4/2/98 */
int GetOneOverSin(int a);
/* CDF 4/2/98 */

@@ -1215,9 +1215,9 @@ fptmp = (b); \
FloatToInt(); \
a = itmp;}
#else /* other compiler ? */
#else
/* #error "Unknown compiler" */
#if 0
void ADD_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c);
void ADD_LL_PP(LONGLONGCH *c, LONGLONGCH *a);
void SUB_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c);
@@ -1240,6 +1240,722 @@ void RotateAndCopyVector_ASM(VECTORCH *v1, VECTORCH *v2, MATRIXCH *m);
int FloatToInt(float);
#define f2i(a, b) { a = FloatToInt(b); }
#endif
/* ADD */
static __inline__ void ADD_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c)
{
    /*
    _asm
    {
        mov esi,a
        mov edi,b
        mov ebx,c
        mov eax,[esi]
        mov edx,[esi+4]
        add eax,[edi]
        adc edx,[edi+4]
        mov [ebx],eax
        mov [ebx+4],edx
    }
    */
    __asm__("movl 0(%%esi), %%eax   \n\t"
            "movl 4(%%esi), %%edx   \n\t"
            "addl 0(%%edi), %%eax   \n\t"
            "adcl 4(%%edi), %%edx   \n\t"
            "movl %%eax, 0(%%ebx)   \n\t"
            "movl %%edx, 4(%%ebx)   \n\t"
            :
            : "S" (a), "D" (b), "b" (c)
            : "%eax", "%edx", "memory", "cc"
            );
    /*
    __asm__("movl 0(%%esi), %%eax   \n\t"
            "movl 4(%%esi), %%edx   \n\t"
            "addl 0(%%edi), %%eax   \n\t"
            "adcl 4(%%edi), %%edx   \n\t"
            : "=a" (c->lo32), "=d" (c->hi32)
            : "S" (a), "D" (b)
            );
    */
}
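
/*
    For reference, a portable C sketch of the same 64-bit add; it uses the
    lo32/hi32 members that appear in the commented-out variant above, and
    ADD_LL_C is a hypothetical name for illustration.
*/
static __inline__ void ADD_LL_C(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c)
{
    unsigned int lo = (unsigned int)a->lo32 + (unsigned int)b->lo32;

    /* carry out of the low word iff the 32-bit sum wrapped around */
    c->hi32 = a->hi32 + b->hi32 + (lo < (unsigned int)b->lo32);
    c->lo32 = lo;
}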
/* ADD ++ */
static __inline__ void ADD_LL_PP(LONGLONGCH *c, LONGLONGCH *a)
{
    /*
    _asm
    {
        mov edi,c
        mov esi,a
        mov eax,[esi]
        mov edx,[esi+4]
        add [edi],eax
        adc [edi+4],edx
    }
    */
    __asm__("movl 0(%%esi), %%eax   \n\t"
            "movl 4(%%esi), %%edx   \n\t"
            "addl %%eax, 0(%%edi)   \n\t"
            "adcl %%edx, 4(%%edi)   \n\t"
            :
            : "D" (c), "S" (a)
            : "%eax", "%edx", "memory", "cc"
            );
}
/* SUB */
static __inline__ void SUB_LL(LONGLONGCH *a, LONGLONGCH *b, LONGLONGCH *c)
{
    /*
    _asm
    {
        mov esi,a
        mov edi,b
        mov ebx,c
        mov eax,[esi]
        mov edx,[esi+4]
        sub eax,[edi]
        sbb edx,[edi+4]
        mov [ebx],eax
        mov [ebx+4],edx
    }
    */
    __asm__("movl 0(%%esi), %%eax   \n\t"
            "movl 4(%%esi), %%edx   \n\t"
            "subl 0(%%edi), %%eax   \n\t"
            "sbbl 4(%%edi), %%edx   \n\t"
            "movl %%eax, 0(%%ebx)   \n\t"
            "movl %%edx, 4(%%ebx)   \n\t"
            :
            : "S" (a), "D" (b), "b" (c)
            : "%eax", "%edx", "memory", "cc"
            );
}
/* SUB -- */
static __inline__ void SUB_LL_MM(LONGLONGCH *c, LONGLONGCH *a)
{
    /*
    _asm
    {
        mov edi,c
        mov esi,a
        mov eax,[esi]
        mov edx,[esi+4]
        sub [edi],eax
        sbb [edi+4],edx
    }
    */
    __asm__("movl 0(%%esi), %%eax   \n\t"
            "movl 4(%%esi), %%edx   \n\t"
            "subl %%eax, 0(%%edi)   \n\t"
            "sbbl %%edx, 4(%%edi)   \n\t"
            :
            : "D" (c), "S" (a)
            : "%eax", "%edx", "memory", "cc"
            );
}
/*
    MUL

    This is the multiply we use, the 32 x 32 = 64 widening version
*/
static __inline__ void MUL_I_WIDE(int a, int b, LONGLONGCH *c)
{
    /*
    _asm
    {
        mov eax,a
        mov ebx,c
        imul b
        mov [ebx],eax
        mov [ebx+4],edx
    }
    */
    /* operand %2 is b; with no outputs, %0 would be a (already in eax) */
    __asm__("imull %2               \n\t"
            "movl %%eax, 0(%%ebx)   \n\t"
            "movl %%edx, 4(%%ebx)   \n\t"
            :
            : "a" (a), "b" (c), "q" (b)
            : "%edx", "memory", "cc"
            );
}
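
/*
    For reference, a portable sketch of the widening multiply using GCC's
    64-bit long long; MUL_I_WIDE_C is a hypothetical name for illustration,
    and lo32/hi32 follow the commented-out ADD_LL variant above.
*/
static __inline__ void MUL_I_WIDE_C(int a, int b, LONGLONGCH *c)
{
    long long product = (long long)a * (long long)b;

    c->lo32 = (unsigned int)product;    /* bits 0..31  */
    c->hi32 = (int)(product >> 32);     /* bits 32..63 */
}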
/*
    CMP

    This substitutes for ==, >, <, >=, <=
*/
static __inline__ int CMP_LL(LONGLONGCH *a, LONGLONGCH *b)
{
    int retval;
    /*
    _asm
    {
        mov ebx,a
        mov ecx,b
        mov eax,[ebx]
        mov edx,[ebx+4]
        sub eax,[ecx]
        sbb edx,[ecx+4]
        and edx,edx
        jne llnz
        and eax,eax
        je llgs
        llnz:
        mov retval,1
        and edx,edx
        jge llgs
        neg retval
        llgs:
    }
    */
    /* local labels (1:, 2:) so the block can be inlined more than once;
       %0 is written before the inputs are dead, hence the earlyclobber */
    __asm__("xorl %0, %0            \n\t"
            "movl 0(%%ebx), %%eax   \n\t"
            "movl 4(%%ebx), %%edx   \n\t"
            "subl 0(%%ecx), %%eax   \n\t"
            "sbbl 4(%%ecx), %%edx   \n\t"
            "andl %%edx, %%edx      \n\t"
            "jne 1f                 \n\t"
            "andl %%eax, %%eax      \n\t"
            "je 2f                  \n"
            "1:                     \n\t"
            "movl $1, %0            \n\t"
            "andl %%edx, %%edx      \n\t"
            "jge 2f                 \n\t"
            "negl %0                \n"
            "2:                     \n\t"
            : "=&r" (retval)
            : "b" (a), "c" (b)
            : "%eax", "%edx", "memory", "cc"
            );

    return retval;
}
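
/*
    For reference, a portable sketch with the same -1/0/1 result the
    assembler version intends; CMP_LL_C is a hypothetical name for
    illustration.
*/
static __inline__ int CMP_LL_C(LONGLONGCH *a, LONGLONGCH *b)
{
    if (a->hi32 != b->hi32)
        return (a->hi32 < b->hi32) ? -1 : 1;
    if ((unsigned int)a->lo32 != (unsigned int)b->lo32)
        return ((unsigned int)a->lo32 < (unsigned int)b->lo32) ? -1 : 1;
    return 0;
}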
/* EQUALS */
static __inline__ void EQUALS_LL(LONGLONGCH *a, LONGLONGCH *b)
{
    /*
    _asm
    {
        mov edi,a
        mov esi,b
        mov eax,[esi]
        mov edx,[esi+4]
        mov [edi],eax
        mov [edi+4],edx
    }
    */
    __asm__("movl 0(%%esi), %%eax   \n\t"
            "movl 4(%%esi), %%edx   \n\t"
            "movl %%eax, 0(%%edi)   \n\t"
            "movl %%edx, 4(%%edi)   \n\t"
            :
            : "D" (a), "S" (b)
            : "%eax", "%edx", "memory"
            );
}
/* NEGATE */
static __inline__ void NEG_LL(LONGLONGCH *a)
{
    /*
    _asm
    {
        mov esi,a
        not dword ptr[esi]
        not dword ptr[esi+4]
        add dword ptr[esi],1
        adc dword ptr[esi+4],0
    }
    */
    __asm__("notl 0(%%esi)      \n\t"
            "notl 4(%%esi)      \n\t"
            "addl $1, 0(%%esi)  \n\t"
            "adcl $0, 4(%%esi)  \n\t"
            :
            : "S" (a)
            : "memory", "cc"
            );
}
/* ASR */
static __inline__ void ASR_LL(LONGLONGCH *a, int shift)
{
    /*
    _asm
    {
        mov esi,a
        mov eax,shift
        and eax,eax
        jle asrdn
        asrlp:
        sar dword ptr[esi+4],1
        rcr dword ptr[esi],1
        dec eax
        jne asrlp
        asrdn:
    }
    */
    /* local labels, and eax as an in/out operand since decl modifies it */
    __asm__("andl %%eax, %%eax  \n\t"
            "jle 1f             \n"
            "0:                 \n\t"
            "sarl $1, 4(%%esi)  \n\t"
            "rcrl $1, 0(%%esi)  \n\t"
            "decl %%eax         \n\t"
            "jne 0b             \n"
            "1:                 \n\t"
            : "=a" (shift)
            : "S" (a), "0" (shift)
            : "memory", "cc"
            );
}
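
/*
    For reference, a portable sketch of the shift: the sar/rcr pair above
    feeds each bit leaving the high word into the low word through the
    carry flag, one bit position per loop iteration. ASR_LL_C is a
    hypothetical name; it assumes >> on a negative long long shifts
    arithmetically, as GCC on x86 does.
*/
static __inline__ void ASR_LL_C(LONGLONGCH *a, int shift)
{
    long long v = ((long long)a->hi32 << 32) | (unsigned int)a->lo32;

    if (shift > 0)
        v >>= shift;
    a->lo32 = (unsigned int)v;
    a->hi32 = (int)(v >> 32);
}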
/* Convert int to LONGLONGCH */
static __inline__ void IntToLL(LONGLONGCH *a, int *b)
{
    /*
    _asm
    {
        mov esi,b
        mov edi,a
        mov eax,[esi]
        cdq
        mov [edi],eax
        mov [edi+4],edx
    }
    */
    __asm__("movl 0(%%esi), %%eax   \n\t"
            "cdq                    \n\t"
            "movl %%eax, 0(%%edi)   \n\t"
            "movl %%edx, 4(%%edi)   \n\t"
            :
            : "S" (b), "D" (a)
            : "%eax", "%edx", "memory", "cc"
            );
}
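
/*
    For reference, a portable sketch of the conversion: cdq above
    replicates the sign bit of eax into all of edx. IntToLL_C is a
    hypothetical name for illustration.
*/
static __inline__ void IntToLL_C(LONGLONGCH *a, int *b)
{
    a->lo32 = (unsigned int)*b;
    a->hi32 = (*b < 0) ? -1 : 0;    /* sign extension */
}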
/*
    Fixed Point Multiply.

    16.16 * 16.16 -> 16.16
    or
    16.16 * 0.32 -> 0.32

    A proper version of this function ought to read
    16.16 * 16.16 -> 32.16
    but this would require a long long result

    Algorithm:
    Take the mid 32 bits of the 64 bit result
*/
/*
    These functions have been checked for suitability for
    a Pentium and look as if they would work adequately.
    Might be worth a more detailed look at optimising
    them though.
*/
static __inline__ int MUL_FIXED(int a, int b)
{
    int retval;
    /*
    _asm
    {
        mov eax,a
        imul b
        shrd eax,edx,16
        mov retval,eax
    }
    */
    /* operand %2 is b; %0 is the eax output, so "imull %0" would square a */
    __asm__("imull %2                   \n\t"
            "shrdl $16, %%edx, %%eax    \n\t"
            : "=a" (retval)
            : "a" (a), "q" (b)
            : "%edx", "cc"
            );

    return retval;
}
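
/*
    For reference, a portable sketch of the 16.16 multiply described above:
    form the 64-bit product and keep its middle 32 bits, which is exactly
    what the shrdl $16 extracts. MUL_FIXED_C is a hypothetical name for
    illustration.
*/
static __inline__ int MUL_FIXED_C(int a, int b)
{
    return (int)(((long long)a * (long long)b) >> 16);
}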
/*
    Fixed Point Divide - returns a / b
*/
static __inline__ int DIV_FIXED(int a, int b)
{
    int retval;
    /*
    _asm
    {
        mov eax,a
        cdq
        rol eax,16
        mov dx,ax
        xor ax,ax
        idiv b
        mov retval,eax
    }
    */
    /* operand %2 is the divisor b */
    __asm__("cdq                \n\t"
            "roll $16, %%eax    \n\t"
            "movw %%ax, %%dx    \n\t"
            "xorw %%ax, %%ax    \n\t"
            "idivl %2           \n\t"
            : "=a" (retval)
            : "a" (a), "q" (b)
            : "%edx", "cc"
            );

    return retval;
}
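
/*
    For reference, a portable sketch of the 16.16 divide: the
    cdq/roll/movw/xorw sequence above builds the sign-extended value
    (a << 16) in edx:eax before dividing. DIV_FIXED_C is a hypothetical
    name; like idivl, it misbehaves if b is zero or the quotient overflows
    32 bits.
*/
static __inline__ int DIV_FIXED_C(int a, int b)
{
    return (int)(((long long)a << 16) / b);
}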
/*
    Multiply and Divide Functions.
*/

/*
    32/32 division

    This macro is a function on some other platforms
*/
#define DIV_INT(a, b) ((a) / (b))

/*
    A Narrowing 64/32 Division
*/
static __inline__ int NarrowDivide(LONGLONGCH *a, int b)
{
    int retval;
    /*
    _asm
    {
        mov esi,a
        mov eax,[esi]
        mov edx,[esi+4]
        idiv b
        mov retval,eax
    }
    */
    /* operand %2 is the divisor b */
    __asm__("movl 0(%%esi), %%eax   \n\t"
            "movl 4(%%esi), %%edx   \n\t"
            "idivl %2               \n\t"
            : "=a" (retval)
            : "S" (a), "q" (b)
            : "%edx", "cc"
            );

    return retval;
}
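
/*
    For reference, a portable sketch of the narrowing divide, again using
    the lo32/hi32 members; NarrowDivide_C is a hypothetical name. As with
    idivl, the quotient must fit in 32 bits.
*/
static __inline__ int NarrowDivide_C(LONGLONGCH *a, int b)
{
    long long v = ((long long)a->hi32 << 32) | (unsigned int)a->lo32;

    return (int)(v / b);
}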
/*
    This function performs a Widening Multiply followed by a Narrowing Divide.

    a = (a * b) / c
*/
static __inline__ int WideMulNarrowDiv(int a, int b, int c)
{
    int retval;
    /*
    _asm
    {
        mov eax,a
        imul b
        idiv c
        mov retval,eax
    }
    */
    /* %2 is b, %3 is c; edx holds the product's high half, so clobber it */
    __asm__("imull %2   \n\t"
            "idivl %3   \n\t"
            : "=a" (retval)
            : "a" (a), "q" (b), "q" (c)
            : "%edx", "cc"
            );

    return retval;
}
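
/*
    For reference, a portable sketch: keeping the intermediate product in
    64 bits means (a * b) cannot overflow before the divide, which is the
    point of the widening multiply. WideMulNarrowDiv_C is a hypothetical
    name for illustration.
*/
static __inline__ int WideMulNarrowDiv_C(int a, int b, int c)
{
    return (int)(((long long)a * (long long)b) / c);
}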
/*
    Function to rotate a VECTORCH using a MATRIXCH

    This is the C function:

    x = MUL_FIXED(m->mat11, v->vx);
    x += MUL_FIXED(m->mat21, v->vy);
    x += MUL_FIXED(m->mat31, v->vz);

    y = MUL_FIXED(m->mat12, v->vx);
    y += MUL_FIXED(m->mat22, v->vy);
    y += MUL_FIXED(m->mat32, v->vz);

    z = MUL_FIXED(m->mat13, v->vx);
    z += MUL_FIXED(m->mat23, v->vy);
    z += MUL_FIXED(m->mat33, v->vz);

    v->vx = x;
    v->vy = y;
    v->vz = z;

    This is the MUL_FIXED inline assembler function:

    imul edx
    shrd eax,edx,16

    typedef struct matrixch {
        int mat11;      offset  0
        int mat12;      offset  4
        int mat13;      offset  8
        int mat21;      offset 12
        int mat22;      offset 16
        int mat23;      offset 20
        int mat31;      offset 24
        int mat32;      offset 28
        int mat33;      offset 32
    } MATRIXCH;
*/
#if 0 /* TODO if these are needed */
static void RotateVector_ASM(VECTORCH *v, MATRIXCH *m)
{
    _asm
    {
        mov esi,v
        mov edi,m

        mov eax,[edi + 0]
        imul DWORD PTR [esi + 0]
        shrd eax,edx,16
        mov ecx,eax
        mov eax,[edi + 12]
        imul DWORD PTR [esi + 4]
        shrd eax,edx,16
        add ecx,eax
        mov eax,[edi + 24]
        imul DWORD PTR [esi + 8]
        shrd eax,edx,16
        add ecx,eax

        mov eax,[edi + 4]
        imul DWORD PTR [esi + 0]
        shrd eax,edx,16
        mov ebx,eax
        mov eax,[edi + 16]
        imul DWORD PTR [esi + 4]
        shrd eax,edx,16
        add ebx,eax
        mov eax,[edi + 28]
        imul DWORD PTR [esi + 8]
        shrd eax,edx,16
        add ebx,eax

        mov eax,[edi + 8]
        imul DWORD PTR [esi + 0]
        shrd eax,edx,16
        mov ebp,eax
        mov eax,[edi + 20]
        imul DWORD PTR [esi + 4]
        shrd eax,edx,16
        add ebp,eax
        mov eax,[edi + 32]
        imul DWORD PTR [esi + 8]
        shrd eax,edx,16
        add ebp,eax

        mov [esi + 0],ecx
        mov [esi + 4],ebx
        mov [esi + 8],ebp
    }
}

/*
    Here is the same function, this time copying the result to a second vector
*/
static void RotateAndCopyVector_ASM(VECTORCH *v1, VECTORCH *v2, MATRIXCH *m)
{
    _asm
    {
        mov esi,v1
        mov edi,m

        mov eax,[edi + 0]
        imul DWORD PTR [esi + 0]
        shrd eax,edx,16
        mov ecx,eax
        mov eax,[edi + 12]
        imul DWORD PTR [esi + 4]
        shrd eax,edx,16
        add ecx,eax
        mov eax,[edi + 24]
        imul DWORD PTR [esi + 8]
        shrd eax,edx,16
        add ecx,eax

        mov eax,[edi + 4]
        imul DWORD PTR [esi + 0]
        shrd eax,edx,16
        mov ebx,eax
        mov eax,[edi + 16]
        imul DWORD PTR [esi + 4]
        shrd eax,edx,16
        add ebx,eax
        mov eax,[edi + 28]
        imul DWORD PTR [esi + 8]
        shrd eax,edx,16
        add ebx,eax

        mov eax,[edi + 8]
        imul DWORD PTR [esi + 0]
        shrd eax,edx,16
        mov ebp,eax
        mov eax,[edi + 20]
        imul DWORD PTR [esi + 4]
        shrd eax,edx,16
        add ebp,eax
        mov eax,[edi + 32]
        imul DWORD PTR [esi + 8]
        shrd eax,edx,16
        add ebp,eax

        mov edx,v2
        mov [edx + 0],ecx
        mov [edx + 4],ebx
        mov [edx + 8],ebp
    }
}
#endif
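
/*
    For reference, a portable sketch of the rotation built from MUL_FIXED,
    following the reference C in the comment above; it assumes VECTORCH has
    vx/vy/vz members as used there, and RotateVector_C is a hypothetical
    name for illustration.
*/
static __inline__ void RotateVector_C(VECTORCH *v, MATRIXCH *m)
{
    int x = MUL_FIXED(m->mat11, v->vx)
          + MUL_FIXED(m->mat21, v->vy)
          + MUL_FIXED(m->mat31, v->vz);
    int y = MUL_FIXED(m->mat12, v->vx)
          + MUL_FIXED(m->mat22, v->vy)
          + MUL_FIXED(m->mat32, v->vz);
    int z = MUL_FIXED(m->mat13, v->vx)
          + MUL_FIXED(m->mat23, v->vy)
          + MUL_FIXED(m->mat33, v->vz);

    v->vx = x;
    v->vy = y;
    v->vz = z;
}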
#if (SupportFPMathsFunctions || SupportFPSquareRoot)

/*
    Square Root

    Returns the Square Root of a 32-bit number
*/
extern int sqrt_temp1;
extern int sqrt_temp2;

static __inline__ int SqRoot32(int A)
{
    sqrt_temp1 = A;
    /*
    _asm
    {
        finit
        fild A
        fsqrt
        fistp temp2
        fwait
    }
    */
    /* gas wants size suffixes (fildl/fistpl) on FPU memory operands */
    __asm__("finit              \n\t"
            "fildl sqrt_temp1   \n\t"
            "fsqrt              \n\t"
            "fistpl sqrt_temp2  \n\t"
            "fwait              \n\t"
            :
            :
            : "memory", "cc"
            );

    return sqrt_temp2;
}
#endif
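
/*
    For reference, a portable sketch of the same square root via the C
    library, assuming <math.h> is available; SqRoot32_C is a hypothetical
    name for illustration. Note fistp rounds to nearest, while the cast
    below truncates.
*/
#include <math.h>   /* sqrt(), used only by this sketch */
static __inline__ int SqRoot32_C(int A)
{
    return (int)sqrt((double)A);
}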
/*
    This may look ugly (it is), but it is a MUCH faster way to convert
    "float" into "int" than the function call "CHP" used by the WATCOM
    compiler.
*/
extern float fti_fptmp;
extern int fti_itmp;

static __inline__ int FloatToInt(float fptmp)
{
    fti_fptmp = fptmp;
    /*
    _asm
    {
        fld fptmp
        fistp itmp
    }
    */
    /* flds loads a 32-bit float; fistpl stores a 32-bit int */
    __asm__("flds fti_fptmp     \n\t"
            "fistpl fti_itmp    \n\t"
            :
            :
            : "memory", "cc"
            );

    return fti_itmp;
}
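
/*
    For reference, a portable sketch of the conversion; FloatToInt_C is a
    hypothetical name for illustration. fistp uses the FPU's current
    rounding mode (round-to-nearest-even by default), while this sketch
    rounds halves away from zero, so results can differ on exact halves.
*/
static __inline__ int FloatToInt_C(float f)
{
    return (int)(f >= 0.0f ? f + 0.5f : f - 0.5f);
}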
/*
    This macro makes usage of the above function easier and more elegant
*/
#define f2i(a, b) { \
    a = FloatToInt(b); \
}

#endif

@@ -18,6 +18,11 @@
#include "kshape.h"
#endif
/* globals from inline.h */
int sqrt_temp1;
int sqrt_temp2;
float fti_fptmp;
int fti_itmp;
/*
@@ -513,88 +518,6 @@ int WideMul2NarrowDiv(int a, int b, int c, int d, int e)
}
/*
    Square Root

    Returns the Square Root of a 32-bit number
*/
#if (SupportFPMathsFunctions || SupportFPSquareRoot)
#else
/*
    Digit-by-digit (restoring) square root: pull the argument in two bits
    at a time from the top and grow the root in ax one bit per step; 15
    loop iterations plus the final unrolled step cover all 32 bits.
*/
int SqRoot32(int A)
{
    unsigned int edx = A;   /* argument bits, consumed from the top */
    unsigned int ecx;       /* loop counter */
    unsigned int ax = 0;    /* root accumulated so far */
    unsigned int bx = 0;    /* running remainder */
    unsigned int di = 0;    /* trial subtrahend */

    for(ecx = 15; ecx != 0; ecx--) {
        /* shift the next two argument bits into the remainder */
        bx <<= 1;
        if(edx & 0x80000000) bx |= 1;
        edx <<= 1;
        bx <<= 1;
        if(edx & 0x80000000) bx |= 1;
        edx <<= 1;

        /* accept a 1 bit if the remainder covers (2*ax + 1),
           where ax has already been doubled for this step */
        ax += ax;
        di = ax;
        di += di;
        if(bx > di) {
            di++;
            ax++;
            bx -= di;
        }
    }

    /* last two bits: the remainder no longer needs updating */
    bx <<= 1;
    if(edx & 0x80000000) bx |= 1;
    edx <<= 1;
    bx <<= 1;
    if(edx & 0x80000000) bx |= 1;
    edx <<= 1;

    ax += ax;
    di = ax;
    di += di;
    if(bx > di) {
        ax++;
    }

    return ((int)ax);
}
#endif /* SupportFPMathsFunctions */
/*
Calculate Plane Normal from three POP's
@@ -1115,99 +1038,6 @@ int Magnitude(VECTORCH *v)
}
/*
    64-bit Square Root returns 32-bit result

    All 64-bit operations are now done using the type LONGLONGCH, whose
    format varies from platform to platform, although it is always 64 bits
    in size.

    NOTE:
    This function is currently not available to Watcom C users.
    A floating point version is STRONGLY advised for the PC anyway.
*/
#if 0
int SqRoot64(LONGLONGCH *A)
{
#if 0
    unsigned long long edx = *A;
    unsigned int eax = 0;
    unsigned int ebx = 0;
    unsigned int edi = 0;
    unsigned int ecx;
    unsigned long long TopBit = 0x8000000000000000LL;

    for(ecx = 31; ecx != 0; ecx--) {
        ebx <<= 1;
        if(edx & TopBit) ebx |= 1;
        edx <<= 1;
        ebx <<= 1;
        if(edx & TopBit) ebx |= 1;
        edx <<= 1;

        eax += eax;
        edi = eax;
        edi += edi;
        if(ebx > edi) {
            edi++;
            eax++;
            ebx -= edi;
        }
    }

    ebx <<= 1;
    if(edx & TopBit) ebx |= 1;
    edx <<= 1;
    ebx <<= 1;
    if(edx & TopBit) ebx |= 1;
    edx <<= 1;

    eax += eax;
    edi = eax;
    edi += edi;
    if(ebx > edi) {
        eax++;
    }

    return eax;
#endif
    return (0);
}
#endif /* for #if 0 */
/*
    Shift the 64-bit value until it is LTE the limit