2 ; match.asm -- Pentium-Pro optimized version of longest_match()
\r
4 ; Updated for zlib 1.1.3 and converted to MASM 6.1x
\r
5 ; Copyright (C) 2000 Dan Higdon <hdan@kinesoft.com>
\r
6 ; and Chuck Walbourn <chuckw@kinesoft.com>
\r
7 ; Corrections by Cosmin Truta <cosmint@cs.ubbcluj.ro>
\r
9 ; This is free software; you can redistribute it and/or modify it
\r
10 ; under the terms of the GNU General Public License.
\r
13 ; Written for zlib 1.1.2
\r
14 ; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
\r
16 ; Modified by Gilles Vollant (2005) for add gzhead and gzindex
\r
21 ;===========================================================================
\r
23 ;===========================================================================
\r
27 MIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1)
\r
28 MAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7))
\r
30 ;===========================================================================
\r
32 ;===========================================================================
\r
34 ; This STRUCT assumes a 4-byte alignment
\r
36 DEFLATE_STATE STRUCT
\r
40 ds_pending_buf_size dd ?
\r
44 ; gzhead and gzindex are added in zlib 1.2.2.2 (see deflate.h)
\r
52 ds_w_size dd ? ; used
\r
54 ds_w_mask dd ? ; used
\r
55 ds_window dd ? ; used
\r
65 ds_match_length dd ? ; used
\r
66 ds_prev_match dd ? ; used
\r
67 ds_match_available dd ?
\r
68 ds_strstart dd ? ; used
\r
69 ds_match_start dd ? ; used
\r
70 ds_lookahead dd ? ; used
\r
71 ds_prev_length dd ? ; used
\r
72 ds_max_chain_length dd ? ; used
\r
73 ds_max_laxy_match dd ?
\r
76 ds_good_match dd ? ; used
\r
77 ds_nice_match dd ? ; used
\r
79 ; Don't need anymore of the struct for match
\r
82 ;===========================================================================
\r
84 ;===========================================================================
\r
87 ;---------------------------------------------------------------------------
\r
89 ;---------------------------------------------------------------------------
\r
93 ; no initialization needed
\r
97 ;---------------------------------------------------------------------------
\r
98 ; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
\r
99 ;---------------------------------------------------------------------------
\r
102 PUBLIC _longest_match
\r
103 _longest_match PROC
\r
105 ; Since this code uses EBP for a scratch register, the stack frame must
\r
106 ; be manually constructed and referenced relative to the ESP register.
\r
110 chainlenwmask = 0 ; high word: current chain len
\r
111 ; low word: s->wmask
\r
112 window = 4 ; local copy of s->window
\r
113 windowbestlen = 8 ; s->window + bestlen
\r
114 scanend = 12 ; last two bytes of string
\r
115 scanstart = 16 ; first two bytes of string
\r
116 scanalign = 20 ; dword-misalignment of string
\r
117 nicematch = 24 ; a good enough match size
\r
118 bestlen = 28 ; size of best match so far
\r
119 scan = 32 ; ptr to string wanting match
\r
120 varsize = 36 ; number of bytes (also offset to last saved register)
\r
122 ; Saved Registers (actually pushed into place)
\r
133 ; Save registers that the compiler may be using
\r
139 ; Allocate local variable space
\r
142 ; Retrieve the function arguments. ecx will hold cur_match
\r
143 ; throughout the entire function. edx will hold the pointer to the
\r
144 ; deflate_state structure during the function's setup (before
\r
145 ; entering the main loop).
\r
147 mov edx, [esp+deflatestate]
\r
148 ASSUME edx:PTR DEFLATE_STATE
\r
150 mov ecx, [esp+curmatch]
\r
152 ; uInt wmask = s->w_mask;
\r
153 ; unsigned chain_length = s->max_chain_length;
\r
154 ; if (s->prev_length >= s->good_match) {
\r
155 ; chain_length >>= 2;
\r
158 mov eax, [edx].ds_prev_length
\r
159 mov ebx, [edx].ds_good_match
\r
161 mov eax, [edx].ds_w_mask
\r
162 mov ebx, [edx].ds_max_chain_length
\r
163 jl SHORT LastMatchGood
\r
167 ; chainlen is decremented once beforehand so that the function can
\r
168 ; use the sign flag instead of the zero flag for the exit test.
\r
169 ; It is then shifted into the high word, to make room for the wmask
\r
170 ; value, which it will always accompany.
\r
175 mov [esp+chainlenwmask], ebx
\r
177 ; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
\r
179 mov eax, [edx].ds_nice_match
\r
180 mov ebx, [edx].ds_lookahead
\r
182 jl SHORT LookaheadLess
\r
185 mov [esp+nicematch], ebx
\r
187 ;/* register Bytef *scan = s->window + s->strstart; */
\r
189 mov esi, [edx].ds_window
\r
190 mov [esp+window], esi
\r
191 mov ebp, [edx].ds_strstart
\r
195 ;/* Determine how many bytes the scan ptr is off from being */
\r
196 ;/* dword-aligned. */
\r
201 mov [esp+scanalign], eax
\r
203 ;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
\r
204 ;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
\r
206 mov eax, [edx].ds_w_size
\r
207 sub eax, MIN_LOOKAHEAD
\r
209 jg SHORT LimitPositive
\r
213 ;/* int best_len = s->prev_length; */
\r
215 mov eax, [edx].ds_prev_length
\r
216 mov [esp+bestlen], eax
\r
218 ;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */
\r
221 mov [esp+windowbestlen], esi
\r
223 ;/* register ush scan_start = *(ushf*)scan; */
\r
224 ;/* register ush scan_end = *(ushf*)(scan+best_len-1); */
\r
225 ;/* Posf *prev = s->prev; */
\r
227 movzx ebx, WORD PTR[edi]
\r
228 mov [esp+scanstart], ebx
\r
229 movzx ebx, WORD PTR[eax+edi-1]
\r
230 mov [esp+scanend], ebx
\r
231 mov edi, [edx].ds_prev
\r
233 ;/* Jump into the main loop. */
\r
235 mov edx, [esp+chainlenwmask]
\r
236 jmp SHORT LoopEntry
\r
239 ; * match = s->window + cur_match;
\r
240 ; * if (*(ushf*)(match+best_len-1) != scan_end ||
\r
241 ; * *(ushf*)match != scan_start) continue;
\r
243 ; * } while ((cur_match = prev[cur_match & wmask]) > limit
\r
244 ; * && --chain_length != 0);
\r
246 ; * Here is the inner loop of the function. The function will spend the
\r
247 ; * majority of its time in this loop, and majority of that time will
\r
248 ; * be spent in the first ten instructions.
\r
250 ; * Within this loop:
\r
252 ; * %ecx = curmatch
\r
253 ; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
\r
254 ; * %esi = windowbestlen - i.e., (window + bestlen)
\r
262 movzx ecx, WORD PTR[edi+ecx*2]
\r
265 sub edx, 000010000H
\r
269 movzx eax, WORD PTR[esi+ecx-1]
\r
271 jnz SHORT LookupLoop
\r
273 mov eax, [esp+window]
\r
274 movzx eax, WORD PTR[eax+ecx]
\r
275 cmp eax, [esp+scanstart]
\r
276 jnz SHORT LookupLoop
\r
278 ;/* Store the current value of chainlen. */
\r
280 mov [esp+chainlenwmask], edx
\r
282 ;/* Point %edi to the string under scrutiny, and %esi to the string we */
\r
283 ;/* are hoping to match it up with. In actuality, %esi and %edi are */
\r
284 ;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
\r
285 ;/* initialized to -(MAX_MATCH_8 - scanalign). */
\r
287 mov esi, [esp+window]
\r
288 mov edi, [esp+scan]
\r
290 mov eax, [esp+scanalign]
\r
291 mov edx, -MAX_MATCH_8
\r
292 lea edi, [edi+eax+MAX_MATCH_8]
\r
293 lea esi, [esi+eax+MAX_MATCH_8]
\r
295 ;/* Test the strings for equality, 8 bytes at a time. At the end,
\r
296 ; * adjust %edx so that it is offset to the exact byte that mismatched.
\r
298 ; * We already know at this point that the first three bytes of the
\r
299 ; * strings match each other, and they can be safely passed over before
\r
300 ; * starting the compare loop. So what this code does is skip over 0-3
\r
301 ; * bytes, as much as necessary in order to dword-align the %edi
\r
302 ; * pointer. (%esi will still be misaligned three times out of four.)
\r
304 ; * It should be confessed that this loop usually does not represent
\r
305 ; * much of the total running time. Replacing it with a more
\r
306 ; * straightforward "rep cmpsb" would not drastically degrade
\r
311 mov eax, DWORD PTR[esi+edx]
\r
312 xor eax, DWORD PTR[edi+edx]
\r
313 jnz SHORT LeaveLoopCmps
\r
315 mov eax, DWORD PTR[esi+edx+4]
\r
316 xor eax, DWORD PTR[edi+edx+4]
\r
317 jnz SHORT LeaveLoopCmps4
\r
328 test eax, 00000FFFFH
\r
338 ;/* Calculate the length of the match. If it is longer than MAX_MATCH, */
\r
339 ;/* then automatically accept it as the best possible match and leave. */
\r
342 mov edi, [esp+scan]
\r
345 jge SHORT LenMaximum
\r
347 ;/* If the length of the match is not longer than the best match we */
\r
348 ;/* have so far, then forget it and return to the lookup loop. */
\r
350 mov edx, [esp+deflatestate]
\r
351 mov ebx, [esp+bestlen]
\r
353 jg SHORT LongerMatch
\r
354 mov esi, [esp+windowbestlen]
\r
355 mov edi, [edx].ds_prev
\r
356 mov ebx, [esp+scanend]
\r
357 mov edx, [esp+chainlenwmask]
\r
361 ;/* s->match_start = cur_match; */
\r
362 ;/* best_len = len; */
\r
363 ;/* if (len >= nice_match) break; */
\r
364 ;/* scan_end = *(ushf*)(scan+best_len-1); */
\r
367 mov ebx, [esp+nicematch]
\r
368 mov [esp+bestlen], eax
\r
369 mov [edx].ds_match_start, ecx
\r
372 mov esi, [esp+window]
\r
374 mov [esp+windowbestlen], esi
\r
375 movzx ebx, WORD PTR[edi+eax-1]
\r
376 mov edi, [edx].ds_prev
\r
377 mov [esp+scanend], ebx
\r
378 mov edx, [esp+chainlenwmask]
\r
382 ;/* Accept the current string, with the maximum possible length. */
\r
385 mov edx, [esp+deflatestate]
\r
386 mov DWORD PTR[esp+bestlen], MAX_MATCH
\r
387 mov [edx].ds_match_start, ecx
\r
389 ;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
\r
390 ;/* return s->lookahead; */
\r
393 mov edx, [esp+deflatestate]
\r
394 mov ebx, [esp+bestlen]
\r
395 mov eax, [edx].ds_lookahead
\r
397 jg SHORT LookaheadRet
\r
401 ; Restore the stack and return from whence we came.
\r
410 _longest_match ENDP
\r