From 848143b85c32aef8cf33ef32f3d82a4d5cd6534c Mon Sep 17 00:00:00 2001 From: LuciaHarcekova Date: Tue, 27 Dec 2022 19:32:11 +0100 Subject: [PATCH 01/17] - add "lz77_compressor" class with compress and decompress methods using LZ77 compression algorithm --- compression/lz77.py | 197 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 compression/lz77.py diff --git a/compression/lz77.py b/compression/lz77.py new file mode 100644 index 000000000000..6844dde143f7 --- /dev/null +++ b/compression/lz77.py @@ -0,0 +1,197 @@ +""" +LZ77 compression algorithm +- lossless data compression published in papers by Abraham Lempel and Jacob Ziv in 1977 +- also known as LZ1 or sliding-window compression +- form the basis for many variations including LZW, LZSS, LZMA and others + +It uses a “sliding window” method. Within the sliding window we have: + - search buffer + - look ahead buffer +len(sliding_window) = len(search_buffer) + len(look_ahead_buffer) + +LZ77 manages a dictionary that uses triples composed of: + - Offset into search buffer, it's the distance between the start of a phrase and + the beginning of a file. + - Length of the match, it's the number of characters that make up a phrase. + - The indicator is represented by a character that is going to be encoded next. + +As a file is parsed, the dictionary is dynamically updated to reflect the compressed +data contents and size. + +Examples: +"cabracadabrarrarrad" <-> [(0, 0, 'c'), (0, 0, 'a'), (0, 0, 'b'), (0, 0, 'r'), + (3, 1, 'c'), (2, 1, 'd'), (7, 4, 'r'), (3, 5, 'd')] +"ababcbababaa" <-> [(0, 0, 'a'), (0, 0, 'b'), (2, 2, 'c'), (4, 3, 'a'), (2, 2, 'a')] +"aacaacabcabaaac" <-> [(0, 0, 'a'), (1, 1, 'c'), (3, 4, 'b'), (3, 3, 'a'), (1, 2, 'c')] + +Sources: +en.wikipedia.org/wiki/LZ77_and_LZ78 +""" + +from typing import List, Tuple + +__version__ = '0.1' +__author__ = 'Lucia Harcekova' + + +class LZ77Compressor: + """ + Class containg compress and decompress methods using LZ77 compression algorithm. + """ + + def __init__(self, window_size=13, lookahead_buffer_size=6): + self.window_size = window_size + self.lookahead_buffer_size = lookahead_buffer_size + self.search_buffer_size = self.window_size - self.lookahead_buffer_size + + def compress(self, text: str) -> List[Tuple[int, int, str]]: + """This method compresses given string text using LZ77 compression algorithm. 
+ + Args: + text (str): string that's going to be compressed + + Returns: + output (List[Tuple[int, int, str]]): the compressed text + + Tests: + >>> lz77_compressor = LZ77Compressor(13, 6) + >>> lz77_compressor.compress("ababcbababaa") + [(0, 0, 'a'), (0, 0, 'b'), (2, 2, 'c'), (4, 3, 'a'), (2, 2, 'a')] + >>> lz77_compressor.compress("aacaacabcabaaac") + [(0, 0, 'a'), (1, 1, 'c'), (3, 4, 'b'), (3, 3, 'a'), (1, 2, 'c')] + """ + + output = [] + search_buffer = "" + + # while there are still characters in text to compress + while text: + + # find the next encoding phrase + # - triplet with offset, length, indicator (the next encoding character) + (offset, length, indicator) = self._find_encoding_token( + text, search_buffer) + + # update the search buffer: + # - add new characters from text into it + # - check if size exceed the max search buffer size, if so, drop the + # oldest elements + search_buffer += text[:length+1] + if len(search_buffer) > self.search_buffer_size: + search_buffer = search_buffer[-self.search_buffer_size:] + + # update the text + text = text[length+1:] + + # append the token to output + output.append((offset, length, indicator)) + + return output + + def decompress(self, tokens: List[Tuple[int, int, str]]) -> str: + """This method turns the list of tokens consisting of triplets of the form + (offset, length, char), into an output string. + + Args: + tokens (List[Tuple[int, int, str]]): Tokens (offset, length, char) + + Returns: + output (str): The decompressed text + + Tests: + >>> lz77_compressor = LZ77Compressor(13, 6) + >>> lz77_compressor.decompress([(0, 0, 'c'), (0, 0, 'a'), (0, 0, 'b'), \ + (0, 0, 'r'), (3, 1, 'c'), (2, 1, 'd'), (7, 4, 'r'), (3, 5, 'd')]) + 'cabracadabrarrarrad' + >>> lz77_compressor.decompress([(0, 0, 'a'), (0, 0, 'b'), (2, 2, 'c'), \ + (4, 3, 'a'), (2, 2, 'a')]) + 'ababcbababaa' + >>> lz77_compressor.decompress([(0, 0, 'a'), (1, 1, 'c'), (3, 4, 'b'), \ + (3, 3, 'a'), (1, 2, 'c')]) + 'aacaacabcabaaac' + """ + + output = "" + + for (offset, length, indicator) in tokens: + for _ in range(length): + output += output[-offset] + output += indicator + + return output + + def _find_encoding_token(self, text: str, search_buffer: str) \ + -> Tuple[int, int, str]: + """Finds the encoding token for the first character in the text. + + Args: + text (str) + search_buffer (str) + + Returns: + Tuple[int, int, str]: Token + + Tests: + >>> lz77_compressor = LZ77Compressor(13, 6) + >>> lz77_compressor._find_encoding_token("abrarrarrad", "abracad") + (7, 4, 'r') + >>> lz77_compressor._find_encoding_token("adabrarrarrad", "cabrac") + (2, 1, 'd') + """ + + # Initialise result parameters to default values + length, offset = 0, 0 + + if search_buffer == "": + return offset, length, text[length] + + for i, character in enumerate(search_buffer): + found_offset = len(search_buffer) - i + if character == text[0]: + found_length = self._match_length_from_index( + text, search_buffer, 0, i) + # if the found length is bigger than the current or if it's equal, + # which means it's offset is smaller: update offset and length + if found_length >= length: + offset, length = found_offset, found_length + + return offset, length, text[length] + + def _match_length_from_index(self, text: str, + window: str, text_index: int, window_index: int) -> int: + """Calculate the longest possible match of text and window characters from + text_index in text and window_index in window. 
+ + Args: + text (str): _description_ + window (str): sliding window + text_index (int): index of character in text + window_index (int): index of character in sliding window + + Returns: + int: The maximum match between text and window, from given indexes. + + Tests: + >>> lz77_compressor = LZ77Compressor(13, 6) + >>> lz77_compressor._match_length_from_index("rarrad", "adabrar", 0, 4) + 5 + >>> lz77_compressor._match_length_from_index("adabrarrarrad", \ + "cabrac", 0, 1) + 1 + """ + if text == "" or text[text_index] != window[window_index]: + return 0 + return 1 + self._match_length_from_index(text, + window + text[text_index], text_index + 1, window_index + 1) + + +if __name__ == '__main__': + + # Initialize compressor class + lz77_compressor = LZ77Compressor(window_size=13, lookahead_buffer_size=6) + + # Example + TEXT = "cabracadabrarrarrad" + compressed_text = lz77_compressor.compress(TEXT) + decompressed_text = lz77_compressor.decompress(compressed_text) + assert decompressed_text == TEXT, "The LZ77 agirithm returned the invalid result." From ee44d716f72358feabea3f229917a849b7ecaf36 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 Dec 2022 18:43:00 +0000 Subject: [PATCH 02/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- compression/lz77.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/compression/lz77.py b/compression/lz77.py index 6844dde143f7..8774c8f1edd7 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -30,8 +30,8 @@ from typing import List, Tuple -__version__ = '0.1' -__author__ = 'Lucia Harcekova' +__version__ = "0.1" +__author__ = "Lucia Harcekova" class LZ77Compressor: @@ -44,7 +44,7 @@ def __init__(self, window_size=13, lookahead_buffer_size=6): self.lookahead_buffer_size = lookahead_buffer_size self.search_buffer_size = self.window_size - self.lookahead_buffer_size - def compress(self, text: str) -> List[Tuple[int, int, str]]: + def compress(self, text: str) -> list[tuple[int, int, str]]: """This method compresses given string text using LZ77 compression algorithm. Args: @@ -69,26 +69,25 @@ def compress(self, text: str) -> List[Tuple[int, int, str]]: # find the next encoding phrase # - triplet with offset, length, indicator (the next encoding character) - (offset, length, indicator) = self._find_encoding_token( - text, search_buffer) + (offset, length, indicator) = self._find_encoding_token(text, search_buffer) # update the search buffer: # - add new characters from text into it # - check if size exceed the max search buffer size, if so, drop the # oldest elements - search_buffer += text[:length+1] + search_buffer += text[: length + 1] if len(search_buffer) > self.search_buffer_size: - search_buffer = search_buffer[-self.search_buffer_size:] + search_buffer = search_buffer[-self.search_buffer_size :] # update the text - text = text[length+1:] + text = text[length + 1 :] # append the token to output output.append((offset, length, indicator)) return output - def decompress(self, tokens: List[Tuple[int, int, str]]) -> str: + def decompress(self, tokens: list[tuple[int, int, str]]) -> str: """This method turns the list of tokens consisting of triplets of the form (offset, length, char), into an output string. 
@@ -120,8 +119,9 @@ def decompress(self, tokens: List[Tuple[int, int, str]]) -> str: return output - def _find_encoding_token(self, text: str, search_buffer: str) \ - -> Tuple[int, int, str]: + def _find_encoding_token( + self, text: str, search_buffer: str + ) -> tuple[int, int, str]: """Finds the encoding token for the first character in the text. Args: @@ -148,8 +148,7 @@ def _find_encoding_token(self, text: str, search_buffer: str) \ for i, character in enumerate(search_buffer): found_offset = len(search_buffer) - i if character == text[0]: - found_length = self._match_length_from_index( - text, search_buffer, 0, i) + found_length = self._match_length_from_index(text, search_buffer, 0, i) # if the found length is bigger than the current or if it's equal, # which means it's offset is smaller: update offset and length if found_length >= length: @@ -157,8 +156,9 @@ def _find_encoding_token(self, text: str, search_buffer: str) \ return offset, length, text[length] - def _match_length_from_index(self, text: str, - window: str, text_index: int, window_index: int) -> int: + def _match_length_from_index( + self, text: str, window: str, text_index: int, window_index: int + ) -> int: """Calculate the longest possible match of text and window characters from text_index in text and window_index in window. @@ -181,11 +181,12 @@ def _match_length_from_index(self, text: str, """ if text == "" or text[text_index] != window[window_index]: return 0 - return 1 + self._match_length_from_index(text, - window + text[text_index], text_index + 1, window_index + 1) + return 1 + self._match_length_from_index( + text, window + text[text_index], text_index + 1, window_index + 1 + ) -if __name__ == '__main__': +if __name__ == "__main__": # Initialize compressor class lz77_compressor = LZ77Compressor(window_size=13, lookahead_buffer_size=6) From 86c2bb32942d055144cf9915a449c65564386560 Mon Sep 17 00:00:00 2001 From: LuciaHarcekova Date: Tue, 27 Dec 2022 20:28:44 +0100 Subject: [PATCH 03/17] - use "list" instead "List", formatting --- compression/lz77.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/compression/lz77.py b/compression/lz77.py index 8774c8f1edd7..191bebbe9395 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -28,7 +28,6 @@ en.wikipedia.org/wiki/LZ77_and_LZ78 """ -from typing import List, Tuple __version__ = "0.1" __author__ = "Lucia Harcekova" @@ -39,19 +38,19 @@ class LZ77Compressor: Class containg compress and decompress methods using LZ77 compression algorithm. """ - def __init__(self, window_size=13, lookahead_buffer_size=6): + def __init__(self, window_size=13, lookahead_buffer_size=6) -> None: self.window_size = window_size self.lookahead_buffer_size = lookahead_buffer_size self.search_buffer_size = self.window_size - self.lookahead_buffer_size - def compress(self, text: str) -> list[tuple[int, int, str]]: + def compress(self, text: str) -> list: """This method compresses given string text using LZ77 compression algorithm. 
Args: text (str): string that's going to be compressed Returns: - output (List[Tuple[int, int, str]]): the compressed text + output (list): the compressed text Tests: >>> lz77_compressor = LZ77Compressor(13, 6) @@ -87,12 +86,12 @@ def compress(self, text: str) -> list[tuple[int, int, str]]: return output - def decompress(self, tokens: list[tuple[int, int, str]]) -> str: + def decompress(self, tokens: list) -> str: """This method turns the list of tokens consisting of triplets of the form (offset, length, char), into an output string. Args: - tokens (List[Tuple[int, int, str]]): Tokens (offset, length, char) + tokens (list): Tokens (offset, length, char) Returns: output (str): The decompressed text @@ -119,9 +118,7 @@ def decompress(self, tokens: list[tuple[int, int, str]]) -> str: return output - def _find_encoding_token( - self, text: str, search_buffer: str - ) -> tuple[int, int, str]: + def _find_encoding_token(self, text: str, search_buffer: str) -> tuple: """Finds the encoding token for the first character in the text. Args: @@ -129,7 +126,7 @@ def _find_encoding_token( search_buffer (str) Returns: - Tuple[int, int, str]: Token + tuple: Token Tests: >>> lz77_compressor = LZ77Compressor(13, 6) From f5936933a3e58cae34438e5fdd5c5a1216a4afb7 Mon Sep 17 00:00:00 2001 From: LuciaHarcekova Date: Tue, 27 Dec 2022 20:33:25 +0100 Subject: [PATCH 04/17] - fix spelling --- compression/lz77.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compression/lz77.py b/compression/lz77.py index 191bebbe9395..9804d9f7032f 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -35,7 +35,7 @@ class LZ77Compressor: """ - Class containg compress and decompress methods using LZ77 compression algorithm. + Class containing compress and decompress methods using LZ77 compression algorithm. """ def __init__(self, window_size=13, lookahead_buffer_size=6) -> None: From ee06ca09f12cb433752f183b481e81e3015ab820 Mon Sep 17 00:00:00 2001 From: LuciaHarcekova Date: Tue, 27 Dec 2022 20:44:08 +0100 Subject: [PATCH 05/17] - add Python type hints --- compression/lz77.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compression/lz77.py b/compression/lz77.py index 9804d9f7032f..aa0c13a688b1 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -38,7 +38,7 @@ class LZ77Compressor: Class containing compress and decompress methods using LZ77 compression algorithm. """ - def __init__(self, window_size=13, lookahead_buffer_size=6) -> None: + def __init__(self, window_size: int = 13, lookahead_buffer_size: int = 6) -> None: self.window_size = window_size self.lookahead_buffer_size = lookahead_buffer_size self.search_buffer_size = self.window_size - self.lookahead_buffer_size From 3198b334b5162ef0395424af4f7120dcb43cf444 Mon Sep 17 00:00:00 2001 From: LuciaHarcekova Date: Tue, 27 Dec 2022 23:10:52 +0100 Subject: [PATCH 06/17] - add 'Token' class to represent triplet (offset, length, indicator) --- compression/lz77.py | 78 +++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/compression/lz77.py b/compression/lz77.py index aa0c13a688b1..12dc4385f537 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -32,6 +32,20 @@ __version__ = "0.1" __author__ = "Lucia Harcekova" +from typing import List + + +class Token: + """ + Dataclass representing triplet called token consisting of length, offset + and indicator. This triplet is used during LZ77 compression. 
+ """ + + def __init__(self, offset: int, length: int, indicator: str) -> None: + self.offset = offset + self.length = length + self.indicator = indicator + class LZ77Compressor: """ @@ -43,21 +57,14 @@ def __init__(self, window_size: int = 13, lookahead_buffer_size: int = 6) -> Non self.lookahead_buffer_size = lookahead_buffer_size self.search_buffer_size = self.window_size - self.lookahead_buffer_size - def compress(self, text: str) -> list: + def compress(self, text: str) -> List[Token]: """This method compresses given string text using LZ77 compression algorithm. Args: text (str): string that's going to be compressed Returns: - output (list): the compressed text - - Tests: - >>> lz77_compressor = LZ77Compressor(13, 6) - >>> lz77_compressor.compress("ababcbababaa") - [(0, 0, 'a'), (0, 0, 'b'), (2, 2, 'c'), (4, 3, 'a'), (2, 2, 'a')] - >>> lz77_compressor.compress("aacaacabcabaaac") - [(0, 0, 'a'), (1, 1, 'c'), (3, 4, 'b'), (3, 3, 'a'), (1, 2, 'c')] + output (List[Token]): the compressed text """ output = [] @@ -68,57 +75,58 @@ def compress(self, text: str) -> list: # find the next encoding phrase # - triplet with offset, length, indicator (the next encoding character) - (offset, length, indicator) = self._find_encoding_token(text, search_buffer) + token = self._find_encoding_token(text, search_buffer) # update the search buffer: # - add new characters from text into it # - check if size exceed the max search buffer size, if so, drop the # oldest elements - search_buffer += text[: length + 1] + search_buffer += text[: token.length + 1] if len(search_buffer) > self.search_buffer_size: search_buffer = search_buffer[-self.search_buffer_size :] # update the text - text = text[length + 1 :] + text = text[token.length + 1 :] # append the token to output - output.append((offset, length, indicator)) + output.append(token) return output - def decompress(self, tokens: list) -> str: - """This method turns the list of tokens consisting of triplets of the form + def decompress(self, tokens: List[Token]) -> str: + """This method turns the List of tokens consisting of triplets of the form (offset, length, char), into an output string. 
Args: - tokens (list): Tokens (offset, length, char) + tokens (List[Token]): Tokens (offset, length, char) Returns: output (str): The decompressed text Tests: >>> lz77_compressor = LZ77Compressor(13, 6) - >>> lz77_compressor.decompress([(0, 0, 'c'), (0, 0, 'a'), (0, 0, 'b'), \ - (0, 0, 'r'), (3, 1, 'c'), (2, 1, 'd'), (7, 4, 'r'), (3, 5, 'd')]) + >>> lz77_compressor.decompress([Token(0, 0, 'c'), Token(0, 0, 'a'), \ + Token(0, 0, 'b'), Token(0, 0, 'r'), Token(3, 1, 'c'), \ + Token(2, 1, 'd'), Token(7, 4, 'r'), Token(3, 5, 'd')]) 'cabracadabrarrarrad' - >>> lz77_compressor.decompress([(0, 0, 'a'), (0, 0, 'b'), (2, 2, 'c'), \ - (4, 3, 'a'), (2, 2, 'a')]) + >>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(0, 0, 'b'), \ + Token(2, 2, 'c'), Token(4, 3, 'a'), Token(2, 2, 'a')]) 'ababcbababaa' - >>> lz77_compressor.decompress([(0, 0, 'a'), (1, 1, 'c'), (3, 4, 'b'), \ - (3, 3, 'a'), (1, 2, 'c')]) + >>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(1, 1, 'c'), \ + Token(3, 4, 'b'), Token(3, 3, 'a'), Token(1, 2, 'c')]) 'aacaacabcabaaac' """ output = "" - for (offset, length, indicator) in tokens: - for _ in range(length): - output += output[-offset] - output += indicator + for token in tokens: + for _ in range(token.length): + output += output[-token.offset] + output += token.indicator return output - def _find_encoding_token(self, text: str, search_buffer: str) -> tuple: + def _find_encoding_token(self, text: str, search_buffer: str) -> Token: """Finds the encoding token for the first character in the text. Args: @@ -126,21 +134,21 @@ def _find_encoding_token(self, text: str, search_buffer: str) -> tuple: search_buffer (str) Returns: - tuple: Token + (offset, length, indicator) (Token) Tests: >>> lz77_compressor = LZ77Compressor(13, 6) - >>> lz77_compressor._find_encoding_token("abrarrarrad", "abracad") - (7, 4, 'r') - >>> lz77_compressor._find_encoding_token("adabrarrarrad", "cabrac") - (2, 1, 'd') + >>> lz77_compressor._find_encoding_token("abrarrarrad", "abracad").offset + 7 + >>> lz77_compressor._find_encoding_token("adabrarrarrad", "cabrac").length + 1 """ # Initialise result parameters to default values length, offset = 0, 0 if search_buffer == "": - return offset, length, text[length] + return Token(offset, length, text[length]) for i, character in enumerate(search_buffer): found_offset = len(search_buffer) - i @@ -151,7 +159,7 @@ def _find_encoding_token(self, text: str, search_buffer: str) -> tuple: if found_length >= length: offset, length = found_offset, found_length - return offset, length, text[length] + return Token(offset, length, text[length]) def _match_length_from_index( self, text: str, window: str, text_index: int, window_index: int @@ -192,4 +200,4 @@ def _match_length_from_index( TEXT = "cabracadabrarrarrad" compressed_text = lz77_compressor.compress(TEXT) decompressed_text = lz77_compressor.decompress(compressed_text) - assert decompressed_text == TEXT, "The LZ77 agirithm returned the invalid result." + assert decompressed_text == TEXT, "The LZ77 algorithm returned the invalid result." 
From 41c5a0fd5ab0ae1a2a68ec0c7ba62f21de68e071 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 Dec 2022 22:11:48 +0000 Subject: [PATCH 07/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- compression/lz77.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compression/lz77.py b/compression/lz77.py index 12dc4385f537..66687316f148 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -57,7 +57,7 @@ def __init__(self, window_size: int = 13, lookahead_buffer_size: int = 6) -> Non self.lookahead_buffer_size = lookahead_buffer_size self.search_buffer_size = self.window_size - self.lookahead_buffer_size - def compress(self, text: str) -> List[Token]: + def compress(self, text: str) -> list[Token]: """This method compresses given string text using LZ77 compression algorithm. Args: @@ -93,7 +93,7 @@ def compress(self, text: str) -> List[Token]: return output - def decompress(self, tokens: List[Token]) -> str: + def decompress(self, tokens: list[Token]) -> str: """This method turns the List of tokens consisting of triplets of the form (offset, length, char), into an output string. From 63f28c6f27bb80f1494c8ffaa513366f7ccbc9e2 Mon Sep 17 00:00:00 2001 From: LuciaHarcekova Date: Wed, 28 Dec 2022 07:03:50 +0100 Subject: [PATCH 08/17] - add test, hange type rom List to list --- compression/lz77.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/compression/lz77.py b/compression/lz77.py index 66687316f148..feebc77e50f1 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -28,12 +28,11 @@ en.wikipedia.org/wiki/LZ77_and_LZ78 """ +from __future__ import annotations __version__ = "0.1" __author__ = "Lucia Harcekova" -from typing import List - class Token: """ @@ -64,7 +63,17 @@ def compress(self, text: str) -> list[Token]: text (str): string that's going to be compressed Returns: - output (List[Token]): the compressed text + output (list[Token]): the compressed text + + Returns: + (offset, length, indicator) (Token) + + Tests: + >>> lz77_compressor = LZ77Compressor(13, 6) + >>> len(lz77_compressor.compress("ababcbababaa")) + 5 + >>> len(lz77_compressor.compress("aacaacabcabaaac")) + 5 """ output = [] @@ -83,10 +92,10 @@ def compress(self, text: str) -> list[Token]: # oldest elements search_buffer += text[: token.length + 1] if len(search_buffer) > self.search_buffer_size: - search_buffer = search_buffer[-self.search_buffer_size :] + search_buffer = search_buffer[-self.search_buffer_size:] # update the text - text = text[token.length + 1 :] + text = text[token.length + 1:] # append the token to output output.append(token) @@ -94,11 +103,11 @@ def compress(self, text: str) -> list[Token]: return output def decompress(self, tokens: list[Token]) -> str: - """This method turns the List of tokens consisting of triplets of the form + """This method turns the list of tokens consisting of triplets of the form (offset, length, char), into an output string. 
Args: - tokens (List[Token]): Tokens (offset, length, char) + tokens (list[Token]): Tokens (offset, length, char) Returns: output (str): The decompressed text From 76b22a21680dc615a20f4586bef410233dd84d4a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 Dec 2022 06:05:41 +0000 Subject: [PATCH 09/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- compression/lz77.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compression/lz77.py b/compression/lz77.py index feebc77e50f1..98f699bffb87 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -92,10 +92,10 @@ def compress(self, text: str) -> list[Token]: # oldest elements search_buffer += text[: token.length + 1] if len(search_buffer) > self.search_buffer_size: - search_buffer = search_buffer[-self.search_buffer_size:] + search_buffer = search_buffer[-self.search_buffer_size :] # update the text - text = text[token.length + 1:] + text = text[token.length + 1 :] # append the token to output output.append(token) From dd40cf3cc93cb6045b6450a2987c1249907d559e Mon Sep 17 00:00:00 2001 From: LuciaHarcekova Date: Wed, 28 Dec 2022 07:07:52 +0100 Subject: [PATCH 10/17] - remove extra import --- compression/lz77.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/compression/lz77.py b/compression/lz77.py index 98f699bffb87..3b6d37cd93d7 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -28,8 +28,6 @@ en.wikipedia.org/wiki/LZ77_and_LZ78 """ -from __future__ import annotations - __version__ = "0.1" __author__ = "Lucia Harcekova" From 153ed96445db46140e23e9fefb0a386050b64c5d Mon Sep 17 00:00:00 2001 From: LuciaHarcekova Date: Wed, 28 Dec 2022 07:14:17 +0100 Subject: [PATCH 11/17] - remove extra types in comments --- compression/lz77.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/compression/lz77.py b/compression/lz77.py index 3b6d37cd93d7..06543960c4ed 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -58,13 +58,13 @@ def compress(self, text: str) -> list[Token]: """This method compresses given string text using LZ77 compression algorithm. Args: - text (str): string that's going to be compressed + text: string that's going to be compressed Returns: - output (list[Token]): the compressed text + output: the compressed text Returns: - (offset, length, indicator) (Token) + token (offset, length, indicator) Tests: >>> lz77_compressor = LZ77Compressor(13, 6) @@ -105,10 +105,10 @@ def decompress(self, tokens: list[Token]) -> str: (offset, length, char), into an output string. Args: - tokens (list[Token]): Tokens (offset, length, char) + tokens: list containing triplets (offset, length, char) Returns: - output (str): The decompressed text + output: decompressed text Tests: >>> lz77_compressor = LZ77Compressor(13, 6) @@ -137,11 +137,11 @@ def _find_encoding_token(self, text: str, search_buffer: str) -> Token: """Finds the encoding token for the first character in the text. Args: - text (str) - search_buffer (str) + text + search_buffer Returns: - (offset, length, indicator) (Token) + (offset, length, indicator) Tests: >>> lz77_compressor = LZ77Compressor(13, 6) @@ -175,13 +175,13 @@ def _match_length_from_index( text_index in text and window_index in window. 
Args: - text (str): _description_ - window (str): sliding window - text_index (int): index of character in text - window_index (int): index of character in sliding window + text: _description_ + window: sliding window + text_index: index of character in text + window_index: index of character in sliding window Returns: - int: The maximum match between text and window, from given indexes. + The maximum match between text and window, from given indexes. Tests: >>> lz77_compressor = LZ77Compressor(13, 6) From 7bf90967c44e360cb5d683dcf733c518973119c3 Mon Sep 17 00:00:00 2001 From: LuciaHarcekova Date: Wed, 28 Dec 2022 08:22:37 +0100 Subject: [PATCH 12/17] - better test --- compression/lz77.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/compression/lz77.py b/compression/lz77.py index 06543960c4ed..c9e2c031cb64 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -28,6 +28,7 @@ en.wikipedia.org/wiki/LZ77_and_LZ78 """ + __version__ = "0.1" __author__ = "Lucia Harcekova" @@ -43,6 +44,12 @@ def __init__(self, offset: int, length: int, indicator: str) -> None: self.length = length self.indicator = indicator + def __repr__(self): + return f"({self.offset}, {self.length}, {self.indicator})" + + def __str__(self): + return f"({self.offset}, {self.length}, {self.indicator})" + class LZ77Compressor: """ @@ -64,14 +71,14 @@ def compress(self, text: str) -> list[Token]: output: the compressed text Returns: - token (offset, length, indicator) + (offset, length, indicator) Tests: >>> lz77_compressor = LZ77Compressor(13, 6) - >>> len(lz77_compressor.compress("ababcbababaa")) - 5 - >>> len(lz77_compressor.compress("aacaacabcabaaac")) - 5 + >>> str(lz77_compressor.compress("ababcbababaa")) + '[(0, 0, a), (0, 0, b), (2, 2, c), (4, 3, a), (2, 2, a)]' + >>> str(lz77_compressor.compress("aacaacabcabaaac")) + '[(0, 0, a), (1, 1, c), (3, 4, b), (3, 3, a), (1, 2, c)]' """ output = [] @@ -206,5 +213,6 @@ def _match_length_from_index( # Example TEXT = "cabracadabrarrarrad" compressed_text = lz77_compressor.compress(TEXT) + print(lz77_compressor.compress("ababcbababaa")) decompressed_text = lz77_compressor.decompress(compressed_text) assert decompressed_text == TEXT, "The LZ77 algorithm returned the invalid result." From 52af3cfc75a93c305cadb5999842511359b7ac3f Mon Sep 17 00:00:00 2001 From: LuciaHarcekova Date: Wed, 28 Dec 2022 08:24:09 +0100 Subject: [PATCH 13/17] - edit comments --- compression/lz77.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/compression/lz77.py b/compression/lz77.py index c9e2c031cb64..2f11e0f8524a 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -71,7 +71,7 @@ def compress(self, text: str) -> list[Token]: output: the compressed text Returns: - (offset, length, indicator) + Compressed text made of triplets (offset, length, indicator). Tests: >>> lz77_compressor = LZ77Compressor(13, 6) @@ -143,13 +143,6 @@ def decompress(self, tokens: list[Token]) -> str: def _find_encoding_token(self, text: str, search_buffer: str) -> Token: """Finds the encoding token for the first character in the text. 
- Args: - text - search_buffer - - Returns: - (offset, length, indicator) - Tests: >>> lz77_compressor = LZ77Compressor(13, 6) >>> lz77_compressor._find_encoding_token("abrarrarrad", "abracad").offset From 32982841aaa360bb29617868e0d7056f2c98e81e Mon Sep 17 00:00:00 2001 From: LuciaHarcekova Date: Wed, 28 Dec 2022 17:30:47 +0100 Subject: [PATCH 14/17] - add return types --- compression/lz77.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compression/lz77.py b/compression/lz77.py index 2f11e0f8524a..8be0314171d2 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -44,10 +44,10 @@ def __init__(self, offset: int, length: int, indicator: str) -> None: self.length = length self.indicator = indicator - def __repr__(self): + def __repr__(self) -> str: return f"({self.offset}, {self.length}, {self.indicator})" - def __str__(self): + def __str__(self) -> str: return f"({self.offset}, {self.length}, {self.indicator})" From b530862009442ea6009f9006b3531e6c1b99637b Mon Sep 17 00:00:00 2001 From: LuciaHarcekova Date: Wed, 28 Dec 2022 18:00:54 +0100 Subject: [PATCH 15/17] - add tests for __str__ and __repr__ --- compression/lz77.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/compression/lz77.py b/compression/lz77.py index 8be0314171d2..68161b3dabf1 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -28,6 +28,7 @@ en.wikipedia.org/wiki/LZ77_and_LZ78 """ +from __future__ import annotations __version__ = "0.1" __author__ = "Lucia Harcekova" @@ -45,9 +46,21 @@ def __init__(self, offset: int, length: int, indicator: str) -> None: self.indicator = indicator def __repr__(self) -> str: + """ + Tests: + >>> token = Token(5, 6, "a") + >>> token.__repr__() + '(5, 6, a)' + """ return f"({self.offset}, {self.length}, {self.indicator})" def __str__(self) -> str: + """ + Tests: + >>> token = Token(5, 6, "a") + >>> token.__str__() + '(5, 6, a)' + """ return f"({self.offset}, {self.length}, {self.indicator})" From b891abf349c07882bc2b26579a65b68cd16bbc28 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Wed, 28 Dec 2022 18:30:52 +0100 Subject: [PATCH 16/17] Update lz77.py --- compression/lz77.py | 94 +++++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 46 deletions(-) diff --git a/compression/lz77.py b/compression/lz77.py index 68161b3dabf1..8b981ed83afc 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -28,38 +28,30 @@ en.wikipedia.org/wiki/LZ77_and_LZ78 """ -from __future__ import annotations + +from dataclasses import dataclass __version__ = "0.1" __author__ = "Lucia Harcekova" +@dataclass class Token: """ Dataclass representing triplet called token consisting of length, offset and indicator. This triplet is used during LZ77 compression. 
""" - - def __init__(self, offset: int, length: int, indicator: str) -> None: - self.offset = offset - self.length = length - self.indicator = indicator + offset: int + length: int + indicator: str def __repr__(self) -> str: """ - Tests: - >>> token = Token(5, 6, "a") - >>> token.__repr__() - '(5, 6, a)' - """ - return f"({self.offset}, {self.length}, {self.indicator})" - - def __str__(self) -> str: - """ - Tests: - >>> token = Token(5, 6, "a") - >>> token.__str__() - '(5, 6, a)' + >>> token = Token(1, 2, "c") + >>> repr(token) + '(1, 2, c)' + >>> str(token) + '(1, 2, c)' """ return f"({self.offset}, {self.length}, {self.indicator})" @@ -75,23 +67,20 @@ def __init__(self, window_size: int = 13, lookahead_buffer_size: int = 6) -> Non self.search_buffer_size = self.window_size - self.lookahead_buffer_size def compress(self, text: str) -> list[Token]: - """This method compresses given string text using LZ77 compression algorithm. + """ + Compress the given string text using LZ77 compression algorithm. Args: - text: string that's going to be compressed - - Returns: - output: the compressed text + text: string to be compressed Returns: - Compressed text made of triplets (offset, length, indicator). + output: the compressed text as a list of Tokens - Tests: - >>> lz77_compressor = LZ77Compressor(13, 6) - >>> str(lz77_compressor.compress("ababcbababaa")) - '[(0, 0, a), (0, 0, b), (2, 2, c), (4, 3, a), (2, 2, a)]' - >>> str(lz77_compressor.compress("aacaacabcabaaac")) - '[(0, 0, a), (1, 1, c), (3, 4, b), (3, 3, a), (1, 2, c)]' + >>> lz77_compressor = LZ77Compressor() + >>> str(lz77_compressor.compress("ababcbababaa")) + '[(0, 0, a), (0, 0, b), (2, 2, c), (4, 3, a), (2, 2, a)]' + >>> str(lz77_compressor.compress("aacaacabcabaaac")) + '[(0, 0, a), (1, 1, c), (3, 4, b), (3, 3, a), (1, 2, c)]' """ output = [] @@ -121,8 +110,8 @@ def compress(self, text: str) -> list[Token]: return output def decompress(self, tokens: list[Token]) -> str: - """This method turns the list of tokens consisting of triplets of the form - (offset, length, char), into an output string. + """ + Convert the list of tokens into an output string. Args: tokens: list containing triplets (offset, length, char) @@ -131,16 +120,16 @@ def decompress(self, tokens: list[Token]) -> str: output: decompressed text Tests: - >>> lz77_compressor = LZ77Compressor(13, 6) - >>> lz77_compressor.decompress([Token(0, 0, 'c'), Token(0, 0, 'a'), \ - Token(0, 0, 'b'), Token(0, 0, 'r'), Token(3, 1, 'c'), \ - Token(2, 1, 'd'), Token(7, 4, 'r'), Token(3, 5, 'd')]) + >>> lz77_compressor = LZ77Compressor() + >>> lz77_compressor.decompress([Token(0, 0, 'c'), Token(0, 0, 'a'), + ... Token(0, 0, 'b'), Token(0, 0, 'r'), Token(3, 1, 'c'), + ... Token(2, 1, 'd'), Token(7, 4, 'r'), Token(3, 5, 'd')]) 'cabracadabrarrarrad' - >>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(0, 0, 'b'), \ - Token(2, 2, 'c'), Token(4, 3, 'a'), Token(2, 2, 'a')]) + >>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(0, 0, 'b'), + ... Token(2, 2, 'c'), Token(4, 3, 'a'), Token(2, 2, 'a')]) 'ababcbababaa' - >>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(1, 1, 'c'), \ - Token(3, 4, 'b'), Token(3, 3, 'a'), Token(1, 2, 'c')]) + >>> lz77_compressor.decompress([Token(0, 0, 'a'), Token(1, 1, 'c'), + ... Token(3, 4, 'b'), Token(3, 3, 'a'), Token(1, 2, 'c')]) 'aacaacabcabaaac' """ @@ -157,17 +146,28 @@ def _find_encoding_token(self, text: str, search_buffer: str) -> Token: """Finds the encoding token for the first character in the text. 
Tests: - >>> lz77_compressor = LZ77Compressor(13, 6) + >>> lz77_compressor = LZ77Compressor() >>> lz77_compressor._find_encoding_token("abrarrarrad", "abracad").offset 7 >>> lz77_compressor._find_encoding_token("adabrarrarrad", "cabrac").length 1 + >>> lz77_compressor._find_encoding_token("abc", "xyz").offset + 0 + >>> lz77_compressor._find_encoding_token("", "xyz").offset + Traceback (most recent call last): + ... + ValueError: We need some text to work with. + >>> lz77_compressor._find_encoding_token("abc", "").offset + 0 """ + if not text: + raise ValueError("We need some text to work with.") + # Initialise result parameters to default values length, offset = 0, 0 - if search_buffer == "": + if not search_buffer: return Token(offset, length, text[length]) for i, character in enumerate(search_buffer): @@ -200,11 +200,11 @@ def _match_length_from_index( >>> lz77_compressor = LZ77Compressor(13, 6) >>> lz77_compressor._match_length_from_index("rarrad", "adabrar", 0, 4) 5 - >>> lz77_compressor._match_length_from_index("adabrarrarrad", \ - "cabrac", 0, 1) + >>> lz77_compressor._match_length_from_index("adabrarrarrad", + ... "cabrac", 0, 1) 1 """ - if text == "" or text[text_index] != window[window_index]: + if not text or text[text_index] != window[window_index]: return 0 return 1 + self._match_length_from_index( text, window + text[text_index], text_index + 1, window_index + 1 @@ -212,7 +212,9 @@ def _match_length_from_index( if __name__ == "__main__": + from doctest import testmod + testmod() # Initialize compressor class lz77_compressor = LZ77Compressor(window_size=13, lookahead_buffer_size=6) From bda07c5d29039caf23866846c8dcb40c093bab48 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 Dec 2022 17:31:58 +0000 Subject: [PATCH 17/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- compression/lz77.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compression/lz77.py b/compression/lz77.py index 8b981ed83afc..7c1a6f6a4c19 100644 --- a/compression/lz77.py +++ b/compression/lz77.py @@ -41,6 +41,7 @@ class Token: Dataclass representing triplet called token consisting of length, offset and indicator. This triplet is used during LZ77 compression. """ + offset: int length: int indicator: str @@ -163,7 +164,7 @@ def _find_encoding_token(self, text: str, search_buffer: str) -> Token: if not text: raise ValueError("We need some text to work with.") - + # Initialise result parameters to default values length, offset = 0, 0