1 /* match.s -- optional optimized asm version of longest match in deflate.c
2 * Copyright (C) 2002 Free Software Foundation, Inc.
3 * Copyright (C) 1992-1993 Jean-loup Gailly
4 * This is free software; you can redistribute it and/or modify it under the
5 * terms of the GNU General Public License, see the file COPYING.
7 * The 68020 version has been written by Francesco Potorti` <pot@cnuce.cnr.it>
8 * with adaptations by Carsten Steger <stegerc@informatik.tu-muenchen.de>,
9 * Andreas Schwab <schwab@lamothe.informatik.uni-dortmund.de> and
10 * Kristoffer Eriksson <ske@pkmab.se>
12 * The ia64 version has been written by Sverre Jarp (HP Labs) 2001-2002.
13 * Unwind directives and some reformatting for better readability added by
14 * David Mosberger-Tang <davidm@hpl.hp.com>.
17 /* $Id: match.S,v 0.14 1993/06/11 18:33:24 jloup Exp $ */
19 /* Preprocess with -DNO_UNDERLINE if your C compiler does not prefix
20 * external symbols with an underline character '_'.
24 # define _window window
25 # define _match_start match_start
26 # define _prev_length prev_length
27 # define _good_match good_match
28 # define _nice_match nice_match
29 # define _strstart strstart
30 # define _max_chain_length max_chain_length
32 # define _match_init match_init
33 # define _longest_match longest_match
37 error: DYN_ALLOC not yet supported in match.s
40 #if defined(i386) || defined(_I386) || defined(__i386) || defined(__i386__)
42 /* This version is for 386 Unix or OS/2 in 32 bit mode.
43 * Warning: it uses the AT&T syntax: mov source,dest
44 * This file is only optional. If you want to force the C version,
45 * add -DNO_ASM to CFLAGS in Makefile and set OBJA to an empty string.
46 * If you have reduced WSIZE in gzip.h, then change its value below.
47 * This version assumes static allocation of the arrays (-DDYN_ALLOC not used).
53 #define MAX_MATCH2 $128 /* MAX_MATCH/2-1 */
56 #define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1
66 /*-----------------------------------------------------------------------
67 * Set match_start to the longest match starting at the given string and
68 * return its length. Matches shorter or equal to prev_length are discarded,
69 * in which case the result is equal to prev_length and match_start is
71 * IN assertions: cur_match is the head of the hash chain for the current
72 * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
75 _longest_match: /* int longest_match(cur_match) */
77 #define cur_match 20(%esp)
78 /* return address */ /* esp+16 */
79 push %ebp /* esp+12 */
87 * chain_length equ ebp
92 mov _max_chain_length,%ebp /* chain_length = max_chain_length */
95 sub MAX_DIST,%edx /* limit = strstart-MAX_DIST */
97 sub %edx,%edx /* limit = NIL */
99 add $2+_window,%edi /* edi = offset(window+strstart+2) */
100 mov _prev_length,%ebx /* best_len = prev_length */
101 movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */
102 movw -2(%edi),%cx /* cx = scan[0..1] */
103 cmp _good_match,%ebx /* do we have a good match already? */
105 shr $2,%ebp /* chain_length >>= 2 */
110 /* at this point, edi == scan+2, esi == cur_match */
111 movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */
112 movw -2(%edi),%cx /* cx = scan[0..1] */
115 * at this point, di == scan+2, si == cur_match,
116 * ax = scan[best_len-1..best_len] and cx = scan[0..1]
119 movw _prev(%esi,%esi),%si /* cur_match = prev[cur_match] */
120 /* top word of esi is still 0 */
121 cmp %edx,%esi /* cur_match <= limit ? */
123 dec %ebp /* --chain_length */
126 cmpw _window-1(%ebx,%esi),%ax/* check match at best_len-1 */
128 cmpw _window(%esi),%cx /* check min_match_length match */
131 lea _window+2(%esi),%esi /* si = match */
132 mov %edi,%eax /* ax = scan+2 */
133 mov MAX_MATCH2,%ecx /* scan for at most MAX_MATCH bytes */
134 rep; cmpsw /* loop until mismatch */
135 je maxmatch /* match of length MAX_MATCH? */
137 movb -2(%edi),%cl /* mismatch on first or second byte? */
138 subb -2(%esi),%cl /* cl = 0 if first bytes equal */
139 xchg %edi,%eax /* edi = scan+2, eax = end of scan */
140 sub %edi,%eax /* eax = len */
141 sub %eax,%esi /* esi = cur_match + 2 + offset(window) */
142 sub $2+_window,%esi /* esi = cur_match */
143 subb $1,%cl /* set carry if cl == 0 (cannot use DEC) */
144 adc $0,%eax /* eax = carry ? len+1 : len */
145 cmp %ebx,%eax /* len > best_len ? */
147 mov %esi,_match_start /* match_start = cur_match */
148 mov %eax,%ebx /* ebx = best_len = len */
149 cmp _nice_match,%eax /* len >= nice_match ? */
152 mov %ebx,%eax /* result = eax = best_len */
164 /* ======================== 680x0 version ================================= */
166 #if defined(m68k)||defined(mc68k)||defined(__mc68000__)||defined(__MC68000__)
172 #if defined(__mc68020__) || defined(__MC68020__) || defined(sysV68)
178 #if defined(mc68020) || defined(mc68000)
180 #if (defined(mc68020) || defined(NeXT)) && !defined(UNALIGNED_OK)
181 # define UNALIGNED_OK
184 #ifdef sysV68 /* Try Motorola Delta style */
186 # define GLOBAL(symbol) global symbol
188 # define FILE(filename) file filename
189 # define invert_maybe(src,dst) dst,src
190 # define imm(data) &data
191 # define reg(register) %register
194 # define addql addq.l
199 # define cmpmb cmpm.b
204 # define movel move.l
205 # define movew move.w
206 # define moveb move.b
207 # define moveml movem.l
210 # define subql subq.l
212 # define IndBase(bd,An) (bd,An)
213 # define IndBaseNdxl(bd,An,Xn) (bd,An,Xn.l)
214 # define IndBaseNdxw(bd,An,Xn) (bd,An,Xn.w)
215 # define predec(An) -(An)
216 # define postinc(An) (An)+
218 #else /* default style (Sun 3, NeXT, Amiga, Atari) */
220 # define GLOBAL(symbol) .globl symbol
222 # define FILE(filename) .even
223 # define invert_maybe(src,dst) src,dst
224 # if defined(sun) || defined(mc68k)
225 # define imm(data) #data
227 # define imm(data) \#data
229 # define reg(register) register
232 # if defined(sun) || defined(mc68k)
237 # define IndBase(bd,An) An@(bd)
238 # define IndBaseNdxl(bd,An,Xn) An@(bd,Xn:l)
239 # define IndBaseNdxw(bd,An,Xn) An@(bd,Xn:w)
240 # define predec(An) An@-
241 # define postinc(An) An@+
245 #define Best_Len reg(d0) /* unsigned */
246 #define Cur_Match reg(d1) /* Ipos */
247 #define Loop_Counter reg(d2) /* int */
248 #define Scan_Start reg(d3) /* unsigned short */
249 #define Scan_End reg(d4) /* unsigned short */
250 #define Limit reg(d5) /* IPos */
251 #define Chain_Length reg(d6) /* unsigned */
252 #define Scan_Test reg(d7)
253 #define Scan reg(a0) /* *uch */
254 #define Match reg(a1) /* *uch */
255 #define Prev_Address reg(a2) /* *Pos */
256 #define Scan_Ini reg(a3) /* *uch */
257 #define Match_Ini reg(a4) /* *uch */
258 #define Stack_Pointer reg(sp)
260 #define MAX_MATCH 258
263 #define MAX_DIST (WSIZE - MAX_MATCH - MIN_MATCH - 1)
266 GLOBAL (_longest_match)
275 /*-----------------------------------------------------------------------
276 * Set match_start to the longest match starting at the given string and
277 * return its length. Matches shorter or equal to prev_length are discarded,
278 * in which case the result is equal to prev_length and match_start is
280 * IN assertions: cur_match is the head of the hash chain for the current
281 * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
284 /* int longest_match (cur_match) */
287 # define pushreg 15928 /* d2-d6/a2-a4 */
290 # define pushreg 16184 /* d2-d7/a2-a4 */
295 movel IndBase(4,Stack_Pointer),Cur_Match
296 moveml imm(pushreg),predec(Stack_Pointer)
297 movel _max_chain_length,Chain_Length
298 movel _prev_length,Best_Len
299 movel imm(_prev),Prev_Address
300 movel imm(_window+MIN_MATCH),Match_Ini
301 movel _strstart,Limit
302 movel Match_Ini,Scan_Ini
304 subw imm(MAX_DIST),Limit
308 cmpl invert_maybe(_good_match,Best_Len)
310 lsrl imm(2),Chain_Length
312 subql imm(1),Chain_Length
314 movew IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
315 movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
317 moveb IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
318 lslw imm(8),Scan_Start
319 moveb IndBase(-MIN_MATCH+1,Scan_Ini),Scan_Start
320 moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
322 moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
328 movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
330 moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
332 moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
336 lslw imm(1),Cur_Match
337 movew IndBaseNdxl(0,Prev_Address,Cur_Match),Cur_Match
338 cmpw invert_maybe(Limit,Cur_Match)
339 dbls Chain_Length,L__do_scan
343 movel Match_Ini,Match
346 cmpw invert_maybe(IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_End)
348 cmpw invert_maybe(IndBase(-MIN_MATCH,Match),Scan_Start)
351 moveb IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_Test
352 lslw imm(8),Scan_Test
353 moveb IndBaseNdxw(-MIN_MATCH,Match,Best_Len),Scan_Test
354 cmpw invert_maybe(Scan_Test,Scan_End)
356 moveb IndBase(-MIN_MATCH,Match),Scan_Test
357 lslw imm(8),Scan_Test
358 moveb IndBase(-MIN_MATCH+1,Match),Scan_Test
359 cmpw invert_maybe(Scan_Test,Scan_Start)
363 movew imm((MAX_MATCH-MIN_MATCH+1)-1),Loop_Counter
366 cmpmb postinc(Match),postinc(Scan)
367 dbne Loop_Counter,L__scan_loop
370 addql imm(MIN_MATCH-1),Scan
371 cmpl invert_maybe(Best_Len,Scan)
374 movel Cur_Match,_match_start
375 cmpl invert_maybe(_nice_match,Best_Len)
378 moveml postinc(Stack_Pointer),imm(popreg)
383 # if defined (__ia64__)
385 /* ======================== ia64 version ================================= */
388 * 'longest_match.S' (assembly program for gzip for the IA-64 architecture)
390 * Optimised for McKinley, but with Merced-compatibility, such as MIB+MIB, used wherever
393 * Copyright: Sverre Jarp (HP Labs) 2001-2002
395 * See deflate.c for c-version
396 * Version 2 - Optimize the outer loop
401 #if __BYTE_ORDER == ____BIG_ENDIAN
411 // 24 rotating register (r32 - r55)
413 #define s_vmatch0 r32
414 #define s_vmatch1 r33
415 #define s_vmatbst r34
416 #define s_vmatbst1 r35
417 #define s_amatblen r36
429 #define s_vstrstart r30
430 #define s_vchainlen r29
431 #define s_awinbest r28
432 #define s_vcurmatch r27
434 #define s_vscanend r25
435 #define s_vscanend1 r24
436 #define s_anicematch r23
441 #define s_awindow r19
442 #define s_amatchstart r18
446 #define s_ascanend r14
448 #define s_vspec_cmatch r11 // next iteration
451 #define s_vbestlen r8 // return register
459 #define p_bn2 p5 // Use in loop (indicating bestlen != 2)
461 #define p_nbs p9 // not new best_len
462 #define p_nnc p10 // not nice_length
466 #define MAX_MATCH 258
469 #define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1
474 #define R_ROTATING 24
480 #define cgtu cmp.gt.unc
481 #define cgeu cmp.ge.unc
482 #define cneu cmp.ne.unc
484 .global longest_match
491 alloc r2=ar.pfs,R_INPUT,R_LOCAL,R_OUTPUT,R_ROTATING
492 .rotr scan[MLAT+2], match[MLAT+2], shscan0[SHLAT+1], \
493 shscan1[SHLAT+1], shmatch0[SHLAT+1], shmatch1[SHLAT+1]
494 .rotp lc[MLAT+SHLAT+2]
495 mova s_vspec_cmatch=in0 // cur_match from input register
496 add s_tm1=@gprel(strstart),gp // a(a(strstart))
498 add s_tm3=@gprel(prev_length),gp // a(a(prev_length))
499 add s_tm5=@ltoff(window),gp // a(a(window))
500 add s_tm6=@ltoff(prev),gp // a(a(prev))
503 ld4 s_vstrstart=[s_tm1] // strstart
504 ld4 s_vbestlen=[s_tm3] // best_len = prev_length
505 brp.loop.imp .cmploop,.cmploop+48
507 add s_tm2=@gprel(max_chain_length),gp // a(a(max_chain_length))
511 ld8 s_aprev=[s_tm6] // a(prev)
512 ld8 s_awindow=[s_tm5] // a(window)
514 movi0 s_prsave=pr // save predicates
516 add s_tm4=@gprel(good_match),gp // a(a(good_match))
517 add s_tm7=@ltoff(nice_match),gp // a(a(nice_match))
518 add s_tm8=@ltoff(match_start),gp // a(match_start)
521 ld8 s_anicematch=[s_tm7] // a(nice_match)
522 ld8 s_amatchstart=[s_tm8] // a(match_start)
523 .save ar.lc, s_lcsave
524 movi0 s_lcsave=ar.lc // save loop count register
527 add s_tm1=-(MAX_MATCH + MIN_MATCH),s_wmask // maxdist
528 cmp.eq p_ll,p0=r0,r0 // parallel compare initialized as 'true'
529 mova s_vcurmatch=s_vspec_cmatch
532 ld4 s_vchainlen=[s_tm2] // chain_length=max_chain_length
533 ld4 s_tm4=[s_tm4] // v(good_match)
534 add s_ascan=s_awindow,s_vstrstart // scan=window + strstart
536 sub s_vlimit=s_vstrstart, s_tm1 // limit=strstart - MAX_DIST
537 add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
538 and s_vspec_cmatch =s_vspec_cmatch,s_wmask
541 add s_amatblen=s_amatch,s_vbestlen //
542 cneu p_bn2,p0=2,s_vbestlen // set if bestlen != 2
543 add s_ascanend=s_ascan,s_vbestlen // compute a(scan) + best_len
545 ld1 s_vscan0=[s_ascan],1 // NB: s_ascan++
546 ld1 s_vmatch0=[s_amatch],1
547 cgtu p0,p_no=s_vlimit,r0 // is result positive ?
550 ld1.nt1 s_vscan1=[s_ascan],2 // NB: s_ascan+3 in total
551 ld1.nt1 s_vmatch1=[s_amatch],2
552 add s_awinbest=s_awindow,s_vbestlen //
555 ld1.nt1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
556 ld1.nt1 s_vmatbst=[s_amatblen],-1
557 (p_no) mova s_vlimit=r0
560 (p_bn2) ld1.nt1 s_vscanend1=[s_ascanend],1 // scan_end1=scan[best_len-1]
561 (p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen]
562 shladd s_vspec_cmatch =s_vspec_cmatch,1,s_aprev
564 cgeu p_shf,p0=s_vbestlen,s_tm4 // is (prev_length >= good_match) ?
567 ld1.nt1 s_vscan3=[s_ascan]
568 ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch]
571 (p_shf) shr.u s_vchainlen=s_vchainlen,2 // (cur_len) >> 2
574 ld1.nt1 s_vmatch3=[s_amatch]
575 // p_ll switched on as soon as we get a mismatch:
576 cmp.eq.and p_ll,p0=s_vmatch0,s_vscan0
577 cmp.eq.and p_ll,p0=s_vmatbst,s_vscanend
579 cmp.eq.and p_ll,p0=s_vmatch1,s_vscan1
580 (p_bn2) cmp.eq.and p_ll,p0=s_vmatbst1,s_vscanend1
581 (p_ll) br.cond.dpnt.many .test_more
587 add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
588 mov s_vcurmatch=s_vspec_cmatch // current value
589 add s_vchainlen=-1,s_vchainlen // --chain_length
591 cmp.le.unc p_end,p0=s_vspec_cmatch,s_vlimit
592 and s_vspec_cmatch=s_vspec_cmatch,s_wmask
593 (p_end) br.cond.dptk.many .terminate
596 ld1 s_vmatch0=[s_amatch],1 // load match[0]
597 // compute prev[cur_match]:
598 shladd s_vspec_cmatch=s_vspec_cmatch,1,s_aprev
599 cmp.eq.unc p_end,p0=s_vchainlen,r0
602 add s_amatblen=s_awinbest,s_vcurmatch // match=window + cur_match
603 (p_end) br.cond.dptk.many .terminate
605 }{.mmi // Cycle 2 (short)
606 ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch] // get next cur_match
608 }{.mmi // Cycle 3 (short)
609 ld1.nt1 s_vmatbst=[s_amatblen],-1 // load match[best_len]
610 cmp.ne.unc p_ll,p0=r0,r0 // parallel compare initialized as 'false'
612 }{.mmi // Cycle 4 (short)
613 // load match[1] - - note: match += 3 (in total):
614 ld1.nt1 s_vmatch1=[s_amatch],2
617 (p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen] // load match[best_len-1]
618 }{.mib // Here we (MOST LIKELY) pay a L2-fetch stall
619 // p_ll switched on as soon as we get a mismatch:
620 cmp.ne.or p_ll,p0=s_vmatch0,s_vscan0
621 cmp.ne.or p_ll,p0=s_vmatbst,s_vscanend
622 (p_ll) br.cond.dptk.many .next_iter
625 ld1.nt1 s_vmatch3=[s_amatch]
629 cmp.ne.or p_ll,p0=s_vmatch1,s_vscan1
630 (p_bn2) cmp.ne.or p_ll,p0=s_vmatbst1,s_vscanend1
631 (p_ll) br.cond.dptk.many .next_iter
635 // We have passed the first hurdle - Are there additional matches ???
639 and s_tm3=7,s_ascan // get byte offset
640 and s_tm4=7,s_amatch // get byte offset
641 movi0 ar.ec=MLAT+SHLAT+2 // NB: One trip more than usual
643 cmp.ne.unc p_no,p0=s_vscan3,s_vmatch3 // does not next one differ?
644 (p_no) br.cond.dptk.many .only3
647 and s_tm1=-8,s_ascan // get aligned address
648 shladd s_tm3=s_tm3,3,r0
649 movi0 ar.lc=31 // 32 times around the loop (8B at a time)
651 and s_tm2=-8,s_amatch // get aligned address
652 shladd s_tm4=s_tm4,3,r0
656 ld8.nt1 scan[1]=[s_tm1],8 // load first chunk
657 sub s_tm5=64,s_tm3 // 64 - amount
660 ld8.nt1 match[1]=[s_tm2],8 // load first chunk
661 sub s_tm6=64,s_tm4 // 64 - amount
662 add s_vlen=-8,s_vlen // will be updated at least once
668 (lc[0]) ld8 scan[0]=[s_tm1],8 // next scan chunk
669 (lc[MLAT+SHLAT+1]) add s_vlen=8,s_vlen
670 (lc[MLAT]) first shscan0[0]=scan[MLAT+1],s_tm3
672 (lc[MLAT+SHLAT+1]) cmp.ne.unc p_no,p0=s_tm7,s_tm8 // break search if !=
673 (lc[MLAT]) first shmatch0[0]=match[MLAT+1],s_tm4
674 (p_no) br.cond.dpnt.many .mismatch
677 (lc[0]) ld8 match[0]=[s_tm2],8
678 // shift left(le) or right(be):
679 (lc[MLAT]) second shscan1[0]=scan[MLAT],s_tm5
680 (lc[MLAT]) second shmatch1[0]=match[MLAT],s_tm6
682 (lc[MLAT+SHLAT]) or s_tm7=shscan0[SHLAT],shscan1[SHLAT]
683 (lc[MLAT+SHLAT]) or s_tm8=shmatch0[SHLAT],shmatch1[SHLAT]
684 br.ctop.dptk.many .cmploop
694 {.mii // Cycle 0 (short)
695 (p_no) pcmp1.eq s_tm2=s_tm7,s_tm8 // find first non-matching character
699 (p_no) count s_tm1=s_tm2
701 }{.mib // Cycle 2 (short)
702 (p_no) add s_vlen=s_vlen,s_tm1 // effective length
709 {.mib // Cycle 0 (short)
710 cmp.gt.unc p0,p_nbs=s_vlen,s_vbestlen // (len > best_len) ?
711 (p_nbs) br.cond.dpnt.many .next_iter // if not, re-iternate
713 }{.mmi // Cycle 1 (short)
714 ld4 s_tm7=[s_anicematch] // nice_match
715 st4 [s_amatchstart]= s_vcurmatch
716 add s_ascanend=s_ascan,s_vlen // reset with best_len
718 }{.mmi // Cycle 2 (short)
719 mova s_vbestlen=s_vlen
720 add s_ascanend=-3,s_ascanend // remember extra offset
722 }{.mmi // Cycle 3 (short)
723 ld1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
724 add s_awinbest=s_awindow,s_vbestlen // update with new best_len
725 cmp.ne.unc p_bn2,p0=2,s_vbestlen // set if bestlen != 2
727 }{.mib // Cycle 4 (short)
728 // scan_end1=scan[best_len-1] NB: s_ascanend reset:
729 ld1.nt1 s_vscanend1=[s_ascanend],1
730 cmp.lt.unc p_nnc,p0=s_vlen,s_tm7 // compare with nice_match
731 (p_nnc) br.cond.dptk.many .next_iter
742 br.ret.sptk.many rp // ret0 is identical to best_len
755 error: this asm version is for 386 or 680x0 or ia64 only
756 # endif /* __ia64__ */
757 #endif /* mc68000 || mc68020 */
758 #endif /* i386 || _I386 */