X-Git-Url: https://git.cworth.org/git?a=blobdiff_plain;f=match.c;h=50b48f88937ddec7816050f03b2f76f90ab5502e;hb=c9eb602f6f5c10a521e7274640033f6a35deca1d;hp=4a3d681c3d2bcc95283745369a54d06ed93f88a5;hpb=302189d124ed5849c2589ea92e912eb24fdc4ab3;p=gzip

diff --git a/match.c b/match.c
index 4a3d681..50b48f8 100644
--- a/match.c
+++ b/match.c
@@ -1,4 +1,5 @@
 /* match.s -- optional optimized asm version of longest match in deflate.c
+ * Copyright (C) 2002 Free Software Foundation, Inc.
  * Copyright (C) 1992-1993 Jean-loup Gailly
  * This is free software; you can redistribute it and/or modify it under the
  * terms of the GNU General Public License, see the file COPYING.
@@ -7,8 +8,12 @@
  * with adaptations by Carsten Steger <stegerc@informatik.tu-muenchen.de>,
  * Andreas Schwab <schwab@lamothe.informatik.uni-dortmund.de> and
  * Kristoffer Eriksson <ske@pkmab.se>
+ *
+ * The ia64 version has been written by Sverre Jarp (HP Labs) 2001-2002.
+ * Unwind directives and some reformatting for better readability added by
+ * David Mosberger-Tang <davidm@hpl.hp.com>.
  */
- 
+
 /* $Id: match.S,v 0.14 1993/06/11 18:33:24 jloup Exp $ */
 
 /* Preprocess with -DNO_UNDERLINE if your C compiler does not prefix
@@ -32,7 +37,7 @@
   error: DYN_ALLOC not yet supported in match.s
 #endif
 
-#if defined(i386) || defined(_I386)
+#if defined(i386) || defined(_I386) || defined(__i386) || defined(__i386__)
 
 /* This version is for 386 Unix or OS/2 in 32 bit mode.
  * Warning: it uses the AT&T syntax: mov source,dest
@@ -374,6 +379,380 @@ L__return:
 	rts
 
 #else
- error: this asm version is for 386 or 680x0 only
+
+# if defined (__ia64__)
+
+/* ======================== ia64 version ================================= */
+
+/*
+ * 'longest_match.S' (assembly program for gzip for the IA-64 architecture)
+ *
+ * Optimised for McKinley, but Merced-compatible constructs (such as MIB+MIB)
+ * are used wherever possible.
+ *
+ * Copyright: Sverre Jarp (HP Labs) 2001-2002
+ *
+ * See deflate.c for c-version
+ * Version 2 - Optimize the outer loop
+ */
+
+#include <endian.h>
+
+#if __BYTE_ORDER == __BIG_ENDIAN	/* both macros are provided by <endian.h> */
+#define  first  shl		/* applied to the earlier chunk with the bit offset */
+#define  second shr.u		/* applied to the following chunk with 64 - bit offset */
+#define  count  czx1.l		/* applied to the pcmp1.eq byte mask after a mismatch */
+#else
+#define  first  shr.u		/* little-endian: shift directions are swapped */
+#define  second shl
+#define  count  czx1.r		/* little-endian: scan the byte mask from the other end */
+#endif
+
+// 24 rotating registers (r32 - r55); r32-r36 are also used under fixed names:
+
+#define s_vmatch0		r32	/* byte match[0] */
+#define s_vmatch1		r33	/* byte match[1] */
+#define s_vmatbst		r34	/* byte match[best_len] */
+#define s_vmatbst1		r35	/* byte match[best_len-1] */
+#define s_amatblen		r36	/* address used to load match[best_len] and match[best_len-1] */
+
+#define s_tm1			r56	/* s_tm1..s_tm8: temporaries */
+#define s_tm2			r57
+#define s_tm3			r58
+#define s_tm4			r59
+#define s_tm5			r60
+#define	s_tm6			r61
+#define s_tm7			r62
+#define s_tm8			r63
+
+#define s_vlen			r31	/* length of the match being measured */
+#define s_vstrstart		r30	/* value of strstart */
+#define s_vchainlen		r29	/* remaining chain_length */
+#define s_awinbest		r28	/* window + best_len */
+#define s_vcurmatch		r27	/* cur_match of the candidate being examined */
+#define s_vlimit		r26	/* limit: search ends when cur_match <= limit */
+#define s_vscanend		r25	/* byte scan[best_len] */
+#define s_vscanend1		r24	/* byte scan[best_len-1] */
+#define s_anicematch		r23	/* address of nice_match */
+#define	s_vscan0		r22	/* byte scan[0] */
+#define s_vscan1		r21	/* byte scan[1] */
+
+#define s_aprev			r20	/* address of prev */
+#define s_awindow		r19	/* address of window */
+#define s_amatchstart		r18	/* address of match_start */
+#define s_ascan			r17	/* scan pointer (window + strstart, then advanced by 3) */
+#define s_amatch		r16	/* match pointer (window + cur_match, then advanced by 3) */
+#define s_wmask			r15	/* WSIZE-1 */
+#define s_ascanend		r14	/* address used to load scan[best_len] and scan[best_len-1] */
+
+#define s_vspec_cmatch		r11		// cur_match candidate for the next iteration
+#define s_lcsave		r10	/* saved ar.lc */
+#define s_prsave		r9	/* saved predicate registers */
+#define s_vbestlen		r8		// best_len; doubles as the return register
+
+#define s_vscan3		r3	/* byte scan[3] */
+#define s_vmatch3		r2	/* byte match[3]; r2 also receives the alloc result on entry */
+
+#define p_no			p2	/* general-purpose "condition failed" predicate */
+#define p_yes			p3	/* (no use visible in this routine) */
+#define p_shf			p4		// prev_length >= good_match: shorten the chain
+#define p_bn2			p5		// Use in loop (indicating bestlen != 2)
+
+#define p_nbs			p9		// not new best_len
+#define p_nnc			p10		// not nice_length
+#define p_ll			p11	/* accumulated result of the parallel byte compares */
+#define p_end			p12	/* end of search: branch to .terminate */
+
+#define MAX_MATCH		258	/* equals the length stored when the compare loop runs to completion */
+#define MIN_MATCH		  4	/* NOTE(review): check against MIN_MATCH in the C sources */
+#define WSIZE		      32768	/* window size; WSIZE-1 is used as the index mask */
+#define MAX_DIST		(WSIZE - MAX_MATCH - MIN_MATCH - 1)	/* parenthesized so it expands safely inside larger expressions */
+
+#define	R_INPUT			1	/* alloc: input registers (cur_match arrives in in0) */
+#define R_LOCAL			31	/* alloc: local registers */
+#define	R_OUTPUT		0	/* alloc: output registers */
+#define R_ROTATING		24	/* alloc: rotating registers */
+#define MLAT			3	/* sizes scan[]/match[] and the lc[] stages; presumably the load latency -- confirm */
+#define SHLAT			2	/* sizes the shifted-chunk arrays and the lc[] stages; presumably the shift latency -- confirm */
+
+#define	mova			mov	/* plain mov */
+#define movi0			mov	/* plain mov; also used for the pr, ar.lc, ar.ec and pr.rot moves */
+#define cgtu			cmp.gt.unc	/* the trailing u maps to .unc, not to an unsigned compare */
+#define cgeu			cmp.ge.unc
+#define cneu			cmp.ne.unc
+
+	.global longest_match	// in0 = cur_match; best_len is returned in r8 (s_vbestlen)
+	.proc longest_match
+	.align 32
+longest_match:
+// --  Cycle: 0
+	.prologue
+{.mmi
+	 alloc r2=ar.pfs,R_INPUT,R_LOCAL,R_OUTPUT,R_ROTATING
+	.rotr scan[MLAT+2], match[MLAT+2], shscan0[SHLAT+1], \
+	      shscan1[SHLAT+1], shmatch0[SHLAT+1], shmatch1[SHLAT+1]
+	.rotp lc[MLAT+SHLAT+2]	// stage predicates of the compare loop
+	mova s_vspec_cmatch=in0 // cur_match from input register
+	add s_tm1=@gprel(strstart),gp // a(strstart), gp-relative
+}{.mmi
+	add s_tm3=@gprel(prev_length),gp // a(prev_length), gp-relative
+	add s_tm5=@ltoff(window),gp // a(a(window))
+	add s_tm6=@ltoff(prev),gp // a(a(prev))
+	;;
+}{.mmb	//  Cycle: 1
+	ld4 s_vstrstart=[s_tm1] // strstart
+	ld4 s_vbestlen=[s_tm3] // best_len = prev_length
+	brp.loop.imp .cmploop,.cmploop+48	// branch-predict hint for the compare loop
+}{.mli
+	add s_tm2=@gprel(max_chain_length),gp // a(max_chain_length), gp-relative
+	movl s_wmask=WSIZE-1	// index mask for prev[]
+	;;
+}{.mmi	//  Cycle: 2
+	ld8 s_aprev=[s_tm6] // a(prev)
+	ld8 s_awindow=[s_tm5] // a(window)
+	.save pr, s_prsave
+	movi0 s_prsave=pr // save predicates
+}{.mmi
+	add s_tm4=@gprel(good_match),gp // a(good_match), gp-relative
+	add s_tm7=@ltoff(nice_match),gp // a(a(nice_match))
+	add s_tm8=@ltoff(match_start),gp // a(a(match_start))
+	;;
+}{.mmi	//  Cycle: 3
+	ld8 s_anicematch=[s_tm7] // a(nice_match)
+	ld8 s_amatchstart=[s_tm8] // a(match_start)
+	.save ar.lc, s_lcsave
+	movi0 s_lcsave=ar.lc // save loop count register
+}{.mmi
+	.body
+	add s_tm1=-(MAX_MATCH + MIN_MATCH),s_wmask // maxdist = WSIZE-1 - (MAX_MATCH+MIN_MATCH)
+	cmp.eq p_ll,p0=r0,r0 // parallel compare initialized as 'true'
+	mova s_vcurmatch=s_vspec_cmatch
+	;;
+}{.mmi	//  Cycle: 4
+	ld4 s_vchainlen=[s_tm2] // chain_length=max_chain_length
+	ld4 s_tm4=[s_tm4] // v(good_match)
+	add s_ascan=s_awindow,s_vstrstart // scan=window + strstart
+}{.mmi
+	sub s_vlimit=s_vstrstart, s_tm1 // limit=strstart - MAX_DIST
+	add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
+	and s_vspec_cmatch =s_vspec_cmatch,s_wmask	// cur_match & (WSIZE-1)
+	;;
+}{.mmi	//  Cycle: 5
+	add s_amatblen=s_amatch,s_vbestlen // a(match[best_len])
+	cneu p_bn2,p0=2,s_vbestlen // set if bestlen != 2
+	add s_ascanend=s_ascan,s_vbestlen // compute a(scan) + best_len
+}{.mmi
+	ld1 s_vscan0=[s_ascan],1 // NB: s_ascan++
+	ld1 s_vmatch0=[s_amatch],1
+	cgtu p0,p_no=s_vlimit,r0 // is result positive ? (p_no set when it is not)
+	;;
+}{.mmi	//  Cycle: 6
+	ld1.nt1 s_vscan1=[s_ascan],2 // NB: s_ascan+3 in total
+	ld1.nt1 s_vmatch1=[s_amatch],2
+	add s_awinbest=s_awindow,s_vbestlen // window + best_len
+	;;
+}{.mmi	//  Cycle: 7
+	ld1.nt1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
+	ld1.nt1 s_vmatbst=[s_amatblen],-1	// match[best_len]
+(p_no)	mova s_vlimit=r0	// limit = 0 when strstart - maxdist is not positive
+	;;
+}{.mmi	//  Cycle: 8
+(p_bn2)	ld1.nt1 s_vscanend1=[s_ascanend],1 // scan_end1=scan[best_len-1]
+(p_bn2)	ld1.nt1 s_vmatbst1=[s_amatblen]	// match[best_len-1]
+	shladd s_vspec_cmatch =s_vspec_cmatch,1,s_aprev	// a(prev[cur_match & (WSIZE-1)]), 2-byte entries
+}{.mmi
+	cgeu p_shf,p0=s_vbestlen,s_tm4 // is (prev_length >= good_match) ?
+	;;
+}{.mmi	//  Cycle: 9
+	ld1.nt1 s_vscan3=[s_ascan]	// scan[3]
+	ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch]	// cur_match for the next iteration
+	mova	s_vlen=3	// starting length; byte 2 is not compared here (presumably implied by the hash -- see deflate.c)
+}{.mmi
+(p_shf)	shr.u s_vchainlen=s_vchainlen,2 // chain_length >>= 2
+	;;
+}{.mmi	//  Cycle: 10
+	ld1.nt1 s_vmatch3=[s_amatch]	// match[3]
+	// p_ll (initialized true above) is cleared as soon as one compare fails:
+	cmp.eq.and p_ll,p0=s_vmatch0,s_vscan0
+	cmp.eq.and p_ll,p0=s_vmatbst,s_vscanend
+}{.mib
+	cmp.eq.and p_ll,p0=s_vmatch1,s_vscan1
+(p_bn2)	cmp.eq.and p_ll,p0=s_vmatbst1,s_vscanend1
+(p_ll)	br.cond.dpnt.many .test_more	// all quick checks passed for the first candidate
+	;;
+}
+
+.next_iter:	// advance to the next candidate in the prev[] chain
+{.mmi	// Cycle 0
+	add s_amatch=s_awindow,s_vspec_cmatch  	// match=window + cur_match
+	mov s_vcurmatch=s_vspec_cmatch		// current value
+	add s_vchainlen=-1,s_vchainlen 		// --chain_length
+}{.mib
+	cmp.le.unc p_end,p0=s_vspec_cmatch,s_vlimit	// cur_match <= limit ?
+	and s_vspec_cmatch=s_vspec_cmatch,s_wmask	// cur_match & (WSIZE-1)
+(p_end)	br.cond.dptk.many .terminate
+	;;
+}{.mmi	// Cycle 1
+	ld1 s_vmatch0=[s_amatch],1		// load match[0]
+	// compute the address of prev[cur_match & (WSIZE-1)]:
+	shladd s_vspec_cmatch=s_vspec_cmatch,1,s_aprev
+	cmp.eq.unc p_end,p0=s_vchainlen,r0	// chain_length exhausted ?
+} {.mib
+	nop.m 0
+	add s_amatblen=s_awinbest,s_vcurmatch	// a(match[best_len]) = window + best_len + cur_match
+(p_end)	br.cond.dptk.many .terminate
+	;;
+}{.mmi	// Cycle 2 (short)
+	ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch]		// get next cur_match
+	;;
+}{.mmi	// Cycle 3 (short)
+	ld1.nt1 s_vmatbst=[s_amatblen],-1	// load match[best_len]
+	cmp.ne.unc p_ll,p0=r0,r0     // parallel compare initialized as 'false'
+	;;
+}{.mmi	// Cycle 4 (short)
+	// load match[1] - - note: match += 3 (in total):
+	ld1.nt1 s_vmatch1=[s_amatch],2
+	;;
+	// Cycle 5  (short)
+(p_bn2)	ld1.nt1 s_vmatbst1=[s_amatblen]		// load match[best_len-1]
+}{.mib	// Here we (MOST LIKELY) pay a L2-fetch stall
+	// p_ll switched on as soon as we get a mismatch:
+	cmp.ne.or p_ll,p0=s_vmatch0,s_vscan0
+	cmp.ne.or p_ll,p0=s_vmatbst,s_vscanend
+(p_ll)	br.cond.dptk.many .next_iter	// mismatch: try the next chain entry
+	;;
+}{.mmi	// Cycle 6
+	ld1.nt1 s_vmatch3=[s_amatch]	// match[3]
+	mova s_vlen=3	// starting length for .test_more
+	nop.i 0
+}{.mib
+	cmp.ne.or p_ll,p0=s_vmatch1,s_vscan1
+(p_bn2)	cmp.ne.or p_ll,p0=s_vmatbst1,s_vscanend1
+(p_ll)	br.cond.dptk.many .next_iter	// mismatch: try the next chain entry
+	;;
+}
+
+// We have passed the first hurdle - Are there additional matches ???
+
+.test_more:	// quick checks passed; set up the 8-byte compare loop from offset 3
+{.mmi	// Cycle 0
+	and s_tm3=7,s_ascan			// get byte offset
+	and s_tm4=7,s_amatch			// get byte offset
+	movi0 ar.ec=MLAT+SHLAT+2		// NB: One trip more than usual
+}{.mib
+	cmp.ne.unc p_no,p0=s_vscan3,s_vmatch3	// do scan[3] and match[3] differ?
+(p_no)  br.cond.dptk.many .only3	// yes: keep s_vlen = 3
+	;;
+}{.mmi	// Cycle 1
+	and s_tm1=-8,s_ascan	// get aligned address
+	shladd s_tm3=s_tm3,3,r0	// byte offset -> bit count
+	movi0 ar.lc=31		// 32 times around the loop (8B at a time)
+}{.mib
+	and s_tm2=-8,s_amatch			// get aligned address
+	shladd s_tm4=s_tm4,3,r0			// byte offset -> bit count
+	nop.b 0
+	;;
+}{.mmi	// Cycle 2
+	ld8.nt1 scan[1]=[s_tm1],8			// load first chunk
+	sub s_tm5=64,s_tm3				// 64 - amount
+	movi0 pr.rot=1<<16	// set p16, clear the other rotating predicates
+}{.mmi
+	ld8.nt1 match[1]=[s_tm2],8	// load first chunk
+	sub s_tm6=64,s_tm4		// 64 - amount
+	add s_vlen=-8,s_vlen		// will be updated at least once
+	;;
+}
+	.align 32
+.cmploop:	// software-pipelined compare of scan vs match, 8 bytes per trip
+{.mmi	// Cycle 0
+(lc[0])			ld8 scan[0]=[s_tm1],8		// next scan chunk
+(lc[MLAT+SHLAT+1]) 	add s_vlen=8,s_vlen	// count the chunk being compared
+(lc[MLAT])		first shscan0[0]=scan[MLAT+1],s_tm3	// earlier chunk, shifted by the bit offset
+}{.mib
+(lc[MLAT+SHLAT+1]) 	cmp.ne.unc p_no,p0=s_tm7,s_tm8	// break search if !=
+(lc[MLAT])		first shmatch0[0]=match[MLAT+1],s_tm4
+(p_no)			br.cond.dpnt.many .mismatch
+			;;
+}{.mii  // Cycle 1
+(lc[0])			ld8 match[0]=[s_tm2],8	// next match chunk
+			// shift left(le) or right(be):
+(lc[MLAT])		second shscan1[0]=scan[MLAT],s_tm5
+(lc[MLAT])		second shmatch1[0]=match[MLAT],s_tm6
+}{.mmb
+(lc[MLAT+SHLAT])	or s_tm7=shscan0[SHLAT],shscan1[SHLAT]	// realigned 8 bytes of scan
+(lc[MLAT+SHLAT])	or s_tm8=shmatch0[SHLAT],shmatch1[SHLAT]	// realigned 8 bytes of match
+			br.ctop.dptk.many .cmploop
+			;;
+}{.mfi
+	mov s_vlen=258	// loop ran to completion without a mismatch (258 == MAX_MATCH)
+	nop.f 0
+}{.mfi
+	nop.f 0    // realign
+	;;
+}
+.mismatch:	// reached by branch (p_no set) or by falling out of the loop
+{.mii	// Cycle 0 (short)
+(p_no)	pcmp1.eq s_tm2=s_tm7,s_tm8 	// find first non-matching character
+	nop.i 0
+	;;
+	// Cycle 1 (short)
+(p_no)	count s_tm1=s_tm2	// count is czx1.l or czx1.r, chosen by byte order
+	;;
+}{.mib	// Cycle 2 (short)
+(p_no)	add s_vlen=s_vlen,s_tm1		// effective length
+	nop.i 0
+	clrrrb	// clear the register rename bases after the rotating loop
+	;;
+}
+
+.only3:	// s_vlen holds the length of the current match
+{.mib	// Cycle 0 (short)
+	cmp.gt.unc p0,p_nbs=s_vlen,s_vbestlen		// (len > best_len) ?
+(p_nbs)	br.cond.dpnt.many .next_iter			// if not, iterate again
+	;;
+}{.mmi	// Cycle 1 (short)
+	ld4 s_tm7=[s_anicematch] 			// nice_match
+	st4 [s_amatchstart]= s_vcurmatch		// match_start = cur_match
+	add s_ascanend=s_ascan,s_vlen			// reset with best_len
+	;;
+}{.mmi	// Cycle 2 (short)
+	mova s_vbestlen=s_vlen				// best_len = len
+	add s_ascanend=-3,s_ascanend		// remember extra offset
+	;;
+}{.mmi	// Cycle 3 (short)
+	ld1 s_vscanend=[s_ascanend],-1		// scan_end=scan[best_len]
+	add s_awinbest=s_awindow,s_vbestlen	// update with new best_len
+  	cmp.ne.unc p_bn2,p0=2,s_vbestlen	// set if bestlen != 2
+	;;
+}{.mib	// Cycle 4 (short)
+	// scan_end1=scan[best_len-1] NB: s_ascanend reset:
+	ld1.nt1 s_vscanend1=[s_ascanend],1
+	cmp.lt.unc p_nnc,p0=s_vlen,s_tm7	// compare with nice_match
+(p_nnc)	br.cond.dptk.many .next_iter		// len < nice_match: keep searching
+	;;
+}
+.terminate:	// restore the saved state and return best_len
+{.mii	// Cycle 0/1
+	nop.m 0
+	movi0 ar.lc=s_lcsave	// restore loop count register
+	movi0 pr=s_prsave,-1	// restore predicates (mask -1)
+}{.mbb
+	nop.m 0
+	nop.b 0
+	br.ret.sptk.many rp	// ret0 is identical to best_len
+	;;
+}
+	.endp
+
+	.global match_init	// does no work in the ia64 version; just returns 0
+	.proc match_init
+match_init:
+	mov ret0=r0		// ret0 = 0 from r0 (always zero); does not read the old ret0, so no NaT can propagate
+	br.ret.sptk.many rp
+	.endp
+
+# else
+ error: this asm version is for 386 or 680x0 or ia64 only 
+# endif /* __ia64__ */
 #endif /* mc68000 || mc68020 */
 #endif /* i386 || _I386   */