+++ /dev/null
-\r
-; match.asm -- Pentium-Pro optimized version of longest_match()\r
-;\r
-; Updated for zlib 1.1.3 and converted to MASM 6.1x\r
-; Copyright (C) 2000 Dan Higdon <hdan@kinesoft.com>\r
-; and Chuck Walbourn <chuckw@kinesoft.com>\r
-; Corrections by Cosmin Truta <cosmint@cs.ubbcluj.ro>\r
-;\r
-; This is free software; you can redistribute it and/or modify it\r
-; under the terms of the GNU General Public License.\r
-\r
-; Based on match.S\r
-; Written for zlib 1.1.2\r
-; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>\r
-;\r
-; Modified by Gilles Vollant (2005) for add gzhead and gzindex\r
-\r
- .686P\r
- .MODEL FLAT\r
-\r
-;===========================================================================\r
-; EQUATES\r
-;===========================================================================\r
-\r
-MAX_MATCH EQU 258\r
-MIN_MATCH EQU 3\r
-MIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1)\r
-MAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7))\r
-\r
-;===========================================================================\r
-; STRUCTURES\r
-;===========================================================================\r
-\r
-; This STRUCT assumes a 4-byte alignment\r
-\r
-DEFLATE_STATE STRUCT\r
-ds_strm dd ?\r
-ds_status dd ?\r
-ds_pending_buf dd ?\r
-ds_pending_buf_size dd ?\r
-ds_pending_out dd ?\r
-ds_pending dd ?\r
-ds_wrap dd ?\r
-; gzhead and gzindex are added in zlib 1.2.2.2 (see deflate.h)\r
-ds_gzhead dd ?\r
-ds_gzindex dd ?\r
-ds_data_type db ?\r
-ds_method db ?\r
- db ? ; padding\r
- db ? ; padding\r
-ds_last_flush dd ?\r
-ds_w_size dd ? ; used\r
-ds_w_bits dd ?\r
-ds_w_mask dd ? ; used\r
-ds_window dd ? ; used\r
-ds_window_size dd ?\r
-ds_prev dd ? ; used\r
-ds_head dd ?\r
-ds_ins_h dd ?\r
-ds_hash_size dd ?\r
-ds_hash_bits dd ?\r
-ds_hash_mask dd ?\r
-ds_hash_shift dd ?\r
-ds_block_start dd ?\r
-ds_match_length dd ? ; used\r
-ds_prev_match dd ? ; used\r
-ds_match_available dd ?\r
-ds_strstart dd ? ; used\r
-ds_match_start dd ? ; used\r
-ds_lookahead dd ? ; used\r
-ds_prev_length dd ? ; used\r
-ds_max_chain_length dd ? ; used\r
-ds_max_laxy_match dd ?\r
-ds_level dd ?\r
-ds_strategy dd ?\r
-ds_good_match dd ? ; used\r
-ds_nice_match dd ? ; used\r
-\r
-; Don't need anymore of the struct for match\r
-DEFLATE_STATE ENDS\r
-\r
-;===========================================================================\r
-; CODE\r
-;===========================================================================\r
-_TEXT SEGMENT\r
-\r
-;---------------------------------------------------------------------------\r
-; match_init\r
-;---------------------------------------------------------------------------\r
- ALIGN 4\r
-PUBLIC _match_init\r
-_match_init PROC\r
- ; no initialization needed\r
- ret\r
-_match_init ENDP\r
-\r
-;---------------------------------------------------------------------------\r
-; uInt longest_match(deflate_state *deflatestate, IPos curmatch)\r
-;---------------------------------------------------------------------------\r
- ALIGN 4\r
-\r
-PUBLIC _longest_match\r
-_longest_match PROC\r
-\r
-; Since this code uses EBP for a scratch register, the stack frame must\r
-; be manually constructed and referenced relative to the ESP register.\r
-\r
-; Stack image\r
-; Variables\r
-chainlenwmask = 0 ; high word: current chain len\r
- ; low word: s->wmask\r
-window = 4 ; local copy of s->window\r
-windowbestlen = 8 ; s->window + bestlen\r
-scanend = 12 ; last two bytes of string\r
-scanstart = 16 ; first two bytes of string\r
-scanalign = 20 ; dword-misalignment of string\r
-nicematch = 24 ; a good enough match size\r
-bestlen = 28 ; size of best match so far\r
-scan = 32 ; ptr to string wanting match\r
-varsize = 36 ; number of bytes (also offset to last saved register)\r
-\r
-; Saved Registers (actually pushed into place)\r
-ebx_save = 36\r
-edi_save = 40\r
-esi_save = 44\r
-ebp_save = 48\r
-\r
-; Parameters\r
-retaddr = 52\r
-deflatestate = 56\r
-curmatch = 60\r
-\r
-; Save registers that the compiler may be using\r
- push ebp\r
- push edi\r
- push esi\r
- push ebx\r
-\r
-; Allocate local variable space\r
- sub esp,varsize\r
-\r
-; Retrieve the function arguments. ecx will hold cur_match\r
-; throughout the entire function. edx will hold the pointer to the\r
-; deflate_state structure during the function's setup (before\r
-; entering the main loop).\r
-\r
- mov edx, [esp+deflatestate]\r
-ASSUME edx:PTR DEFLATE_STATE\r
-\r
- mov ecx, [esp+curmatch]\r
-\r
-; uInt wmask = s->w_mask;\r
-; unsigned chain_length = s->max_chain_length;\r
-; if (s->prev_length >= s->good_match) {\r
-; chain_length >>= 2;\r
-; }\r
-\r
- mov eax, [edx].ds_prev_length\r
- mov ebx, [edx].ds_good_match\r
- cmp eax, ebx\r
- mov eax, [edx].ds_w_mask\r
- mov ebx, [edx].ds_max_chain_length\r
- jl SHORT LastMatchGood\r
- shr ebx, 2\r
-LastMatchGood:\r
-\r
-; chainlen is decremented once beforehand so that the function can\r
-; use the sign flag instead of the zero flag for the exit test.\r
-; It is then shifted into the high word, to make room for the wmask\r
-; value, which it will always accompany.\r
-\r
- dec ebx\r
- shl ebx, 16\r
- or ebx, eax\r
- mov [esp+chainlenwmask], ebx\r
-\r
-; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;\r
-\r
- mov eax, [edx].ds_nice_match\r
- mov ebx, [edx].ds_lookahead\r
- cmp ebx, eax\r
- jl SHORT LookaheadLess\r
- mov ebx, eax\r
-LookaheadLess:\r
- mov [esp+nicematch], ebx\r
-\r
-;/* register Bytef *scan = s->window + s->strstart; */\r
-\r
- mov esi, [edx].ds_window\r
- mov [esp+window], esi\r
- mov ebp, [edx].ds_strstart\r
- lea edi, [esi+ebp]\r
- mov [esp+scan],edi\r
-\r
-;/* Determine how many bytes the scan ptr is off from being */\r
-;/* dword-aligned. */\r
-\r
- mov eax, edi\r
- neg eax\r
- and eax, 3\r
- mov [esp+scanalign], eax\r
-\r
-;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */\r
-;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */\r
-\r
- mov eax, [edx].ds_w_size\r
- sub eax, MIN_LOOKAHEAD\r
- sub ebp, eax\r
- jg SHORT LimitPositive\r
- xor ebp, ebp\r
-LimitPositive:\r
-\r
-;/* int best_len = s->prev_length; */\r
-\r
- mov eax, [edx].ds_prev_length\r
- mov [esp+bestlen], eax\r
-\r
-;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */\r
-\r
- add esi, eax\r
- mov [esp+windowbestlen], esi\r
-\r
-;/* register ush scan_start = *(ushf*)scan; */\r
-;/* register ush scan_end = *(ushf*)(scan+best_len-1); */\r
-;/* Posf *prev = s->prev; */\r
-\r
- movzx ebx, WORD PTR[edi]\r
- mov [esp+scanstart], ebx\r
- movzx ebx, WORD PTR[eax+edi-1]\r
- mov [esp+scanend], ebx\r
- mov edi, [edx].ds_prev\r
-\r
-;/* Jump into the main loop. */\r
-\r
- mov edx, [esp+chainlenwmask]\r
- jmp SHORT LoopEntry\r
-\r
-;/* do {\r
-; * match = s->window + cur_match;\r
-; * if (*(ushf*)(match+best_len-1) != scan_end ||\r
-; * *(ushf*)match != scan_start) continue;\r
-; * [...]\r
-; * } while ((cur_match = prev[cur_match & wmask]) > limit\r
-; * && --chain_length != 0);\r
-; *\r
-; * Here is the inner loop of the function. The function will spend the\r
-; * majority of its time in this loop, and majority of that time will\r
-; * be spent in the first ten instructions.\r
-; *\r
-; * Within this loop:\r
-; * %ebx = scanend\r
-; * %ecx = curmatch\r
-; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)\r
-; * %esi = windowbestlen - i.e., (window + bestlen)\r
-; * %edi = prev\r
-; * %ebp = limit\r
-; */\r
-\r
- ALIGN 4\r
-LookupLoop:\r
- and ecx, edx\r
- movzx ecx, WORD PTR[edi+ecx*2]\r
- cmp ecx, ebp\r
- jbe LeaveNow\r
- sub edx, 000010000H\r
- js LeaveNow\r
-\r
-LoopEntry:\r
- movzx eax, WORD PTR[esi+ecx-1]\r
- cmp eax, ebx\r
- jnz SHORT LookupLoop\r
-\r
- mov eax, [esp+window]\r
- movzx eax, WORD PTR[eax+ecx]\r
- cmp eax, [esp+scanstart]\r
- jnz SHORT LookupLoop\r
-\r
-;/* Store the current value of chainlen. */\r
-\r
- mov [esp+chainlenwmask], edx\r
-\r
-;/* Point %edi to the string under scrutiny, and %esi to the string we */\r
-;/* are hoping to match it up with. In actuality, %esi and %edi are */\r
-;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */\r
-;/* initialized to -(MAX_MATCH_8 - scanalign). */\r
-\r
- mov esi, [esp+window]\r
- mov edi, [esp+scan]\r
- add esi, ecx\r
- mov eax, [esp+scanalign]\r
- mov edx, -MAX_MATCH_8\r
- lea edi, [edi+eax+MAX_MATCH_8]\r
- lea esi, [esi+eax+MAX_MATCH_8]\r
-\r
-;/* Test the strings for equality, 8 bytes at a time. At the end,\r
-; * adjust %edx so that it is offset to the exact byte that mismatched.\r
-; *\r
-; * We already know at this point that the first three bytes of the\r
-; * strings match each other, and they can be safely passed over before\r
-; * starting the compare loop. So what this code does is skip over 0-3\r
-; * bytes, as much as necessary in order to dword-align the %edi\r
-; * pointer. (%esi will still be misaligned three times out of four.)\r
-; *\r
-; * It should be confessed that this loop usually does not represent\r
-; * much of the total running time. Replacing it with a more\r
-; * straightforward "rep cmpsb" would not drastically degrade\r
-; * performance.\r
-; */\r
-\r
-LoopCmps:\r
- mov eax, DWORD PTR[esi+edx]\r
- xor eax, DWORD PTR[edi+edx]\r
- jnz SHORT LeaveLoopCmps\r
-\r
- mov eax, DWORD PTR[esi+edx+4]\r
- xor eax, DWORD PTR[edi+edx+4]\r
- jnz SHORT LeaveLoopCmps4\r
-\r
- add edx, 8\r
- jnz SHORT LoopCmps\r
- jmp LenMaximum\r
- ALIGN 4\r
-\r
-LeaveLoopCmps4:\r
- add edx, 4\r
-\r
-LeaveLoopCmps:\r
- test eax, 00000FFFFH\r
- jnz SHORT LenLower\r
-\r
- add edx, 2\r
- shr eax, 16\r
-\r
-LenLower:\r
- sub al, 1\r
- adc edx, 0\r
-\r
-;/* Calculate the length of the match. If it is longer than MAX_MATCH, */\r
-;/* then automatically accept it as the best possible match and leave. */\r
-\r
- lea eax, [edi+edx]\r
- mov edi, [esp+scan]\r
- sub eax, edi\r
- cmp eax, MAX_MATCH\r
- jge SHORT LenMaximum\r
-\r
-;/* If the length of the match is not longer than the best match we */\r
-;/* have so far, then forget it and return to the lookup loop. */\r
-\r
- mov edx, [esp+deflatestate]\r
- mov ebx, [esp+bestlen]\r
- cmp eax, ebx\r
- jg SHORT LongerMatch\r
- mov esi, [esp+windowbestlen]\r
- mov edi, [edx].ds_prev\r
- mov ebx, [esp+scanend]\r
- mov edx, [esp+chainlenwmask]\r
- jmp LookupLoop\r
- ALIGN 4\r
-\r
-;/* s->match_start = cur_match; */\r
-;/* best_len = len; */\r
-;/* if (len >= nice_match) break; */\r
-;/* scan_end = *(ushf*)(scan+best_len-1); */\r
-\r
-LongerMatch:\r
- mov ebx, [esp+nicematch]\r
- mov [esp+bestlen], eax\r
- mov [edx].ds_match_start, ecx\r
- cmp eax, ebx\r
- jge SHORT LeaveNow\r
- mov esi, [esp+window]\r
- add esi, eax\r
- mov [esp+windowbestlen], esi\r
- movzx ebx, WORD PTR[edi+eax-1]\r
- mov edi, [edx].ds_prev\r
- mov [esp+scanend], ebx\r
- mov edx, [esp+chainlenwmask]\r
- jmp LookupLoop\r
- ALIGN 4\r
-\r
-;/* Accept the current string, with the maximum possible length. */\r
-\r
-LenMaximum:\r
- mov edx, [esp+deflatestate]\r
- mov DWORD PTR[esp+bestlen], MAX_MATCH\r
- mov [edx].ds_match_start, ecx\r
-\r
-;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */\r
-;/* return s->lookahead; */\r
-\r
-LeaveNow:\r
- mov edx, [esp+deflatestate]\r
- mov ebx, [esp+bestlen]\r
- mov eax, [edx].ds_lookahead\r
- cmp ebx, eax\r
- jg SHORT LookaheadRet\r
- mov eax, ebx\r
-LookaheadRet:\r
-\r
-; Restore the stack and return from whence we came.\r
-\r
- add esp, varsize\r
- pop ebx\r
- pop esi\r
- pop edi\r
- pop ebp\r
- ret\r
-\r
-_longest_match ENDP\r
-\r
-_TEXT ENDS\r
-END\r