Skip to content

Commit d0e5916

Browse files
committed
new file: Compression/High-level/sbwt_file_compression.pl
1 parent 747bc1c commit d0e5916

File tree

2 files changed

+225
-0
lines changed

2 files changed

+225
-0
lines changed
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
#!/usr/bin/perl
2+
3+
# Author: Daniel "Trizen" Șuteu
4+
# Date: 09 November 2024
5+
# https://github.com/trizen
6+
7+
# Compress/decompress files using SWAP transform + LZB + Burrows-Wheeler Transform (BWT) + Move-to-Front Transform + Run-length encoding + Huffman coding.
8+
9+
# Reference:
10+
# Data Compression (Summer 2023) - Lecture 13 - BZip2
11+
# https://youtube.com/watch?v=cvoZbBZ3M2A
12+
13+
use 5.036;
14+
use Getopt::Std qw(getopts);
15+
use File::Basename qw(basename);
16+
use Compression::Util qw(:all);
17+
use POSIX qw(ceil);
18+
19+
use constant {
20+
PKGNAME => 'SBWT',
21+
VERSION => '0.01',
22+
FORMAT => 'sbwt',
23+
24+
CHUNK_SIZE => 1 << 17,
25+
};
26+
27+
# Container signature
28+
use constant SIGNATURE => uc(FORMAT) . chr(1);
29+
30+
sub usage {
31+
my ($code) = @_;
32+
print <<"EOH";
33+
usage: $0 [options] [input file] [output file]
34+
35+
options:
36+
-e : extract
37+
-i <filename> : input filename
38+
-o <filename> : output filename
39+
-r : rewrite output
40+
41+
-v : version number
42+
-h : this message
43+
44+
examples:
45+
$0 document.txt
46+
$0 document.txt archive.${\FORMAT}
47+
$0 archive.${\FORMAT} document.txt
48+
$0 -e -i archive.${\FORMAT} -o document.txt
49+
50+
EOH
51+
52+
exit($code // 0);
53+
}
54+
55+
sub version {
56+
printf("%s %s\n", PKGNAME, VERSION);
57+
exit;
58+
}
59+
60+
sub valid_archive {
61+
my ($fh) = @_;
62+
63+
if (read($fh, (my $sig), length(SIGNATURE), 0) == length(SIGNATURE)) {
64+
$sig eq SIGNATURE || return;
65+
}
66+
67+
return 1;
68+
}
69+
70+
sub main {
71+
my %opt;
72+
getopts('ei:o:vhr', \%opt);
73+
74+
$opt{h} && usage(0);
75+
$opt{v} && version();
76+
77+
my ($input, $output) = @ARGV;
78+
$input //= $opt{i} // usage(2);
79+
$output //= $opt{o};
80+
81+
my $ext = qr{\.${\FORMAT}\z}io;
82+
if ($opt{e} || $input =~ $ext) {
83+
84+
if (not defined $output) {
85+
($output = basename($input)) =~ s{$ext}{}
86+
|| die "$0: no output file specified!\n";
87+
}
88+
89+
if (not $opt{r} and -e $output) {
90+
print "'$output' already exists! -- Replace? [y/N] ";
91+
<STDIN> =~ /^y/i || exit 17;
92+
}
93+
94+
decompress_file($input, $output)
95+
|| die "$0: error: decompression failed!\n";
96+
}
97+
elsif ($input !~ $ext || (defined($output) && $output =~ $ext)) {
98+
$output //= basename($input) . '.' . FORMAT;
99+
compress_file($input, $output)
100+
|| die "$0: error: compression failed!\n";
101+
}
102+
else {
103+
warn "$0: don't know what to do...\n";
104+
usage(1);
105+
}
106+
}
107+
108+
sub swap_transform ($text, $extra = 1) {
109+
110+
my @bits;
111+
my @arr = unpack('C*', $text);
112+
my $k = 0;
113+
114+
foreach my $i (1 .. $#arr) {
115+
if ($arr[$i] < $arr[$i - 1 - $k]) {
116+
push @bits, 1;
117+
unshift @arr, splice(@arr, $i, 1);
118+
++$k if $extra;
119+
}
120+
else {
121+
push @bits, 0;
122+
}
123+
}
124+
125+
return (pack('C*', @arr), \@bits);
126+
}
127+
128+
sub reverse_swap_transform ($text, $bits) {
129+
my @arr = unpack('C*', $text);
130+
131+
for (my $i = $#arr ; $i >= 0 ; --$i) {
132+
if ($bits->[$i - 1] == 1) {
133+
splice(@arr, $i, 0, shift(@arr));
134+
}
135+
}
136+
137+
pack('C*', @arr);
138+
}
139+
140+
# Compress file
141+
sub compress_file ($input, $output) {
142+
143+
open my $fh, '<:raw', $input
144+
or die "Can't open file <<$input>> for reading: $!";
145+
146+
my $header = SIGNATURE;
147+
148+
# Open the output file for writing
149+
open my $out_fh, '>:raw', $output
150+
or die "Can't open file <<$output>> for write: $!";
151+
152+
# Print the header
153+
print $out_fh $header;
154+
155+
# Compress data
156+
while (read($fh, (my $chunk), CHUNK_SIZE)) {
157+
158+
local $Compression::Util::LZ_MIN_LEN = 512;
159+
my ($t, $bits) = swap_transform(lzb_compress($chunk, \&lzss_encode_fast), 0);
160+
my $vrle_bits = binary_vrl_encode(join('', @$bits));
161+
162+
if (length($vrle_bits) < scalar @$bits) {
163+
say STDERR "With VLRE: ", length($vrle_bits), " < ", scalar(@$bits);
164+
print $out_fh chr(1);
165+
}
166+
else {
167+
say STDERR "Without VRLE: ", length($vrle_bits), " > ", scalar(@$bits);
168+
$vrle_bits = join('', @$bits);
169+
print $out_fh chr(0);
170+
}
171+
172+
print $out_fh pack('N', length $vrle_bits);
173+
174+
my ($bwt, $idx) = bwt_encode($t);
175+
print $out_fh pack('B*', $vrle_bits);
176+
177+
my ($mtf, $alphabet) = mtf_encode(string2symbols($bwt));
178+
my $rle = zrle_encode($mtf);
179+
print $out_fh (pack('N', $idx) . encode_alphabet($alphabet) . create_huffman_entry($rle));
180+
}
181+
182+
# Close the file
183+
close $out_fh;
184+
}
185+
186+
# Decompress file
187+
sub decompress_file ($input, $output) {
188+
189+
# Open and validate the input file
190+
open my $fh, '<:raw', $input
191+
or die "Can't open file <<$input>> for reading: $!";
192+
193+
valid_archive($fh) || die "$0: file `$input' is not a \U${\FORMAT}\E v${\VERSION} archive!\n";
194+
195+
# Open the output file
196+
open my $out_fh, '>:raw', $output
197+
or die "Can't open file <<$output>> for writing: $!";
198+
199+
while (!eof($fh)) {
200+
201+
my $with_vrle = ord(getc($fh));
202+
my $bits_len = bytes2int($fh, 4);
203+
my $bits = read_bits($fh, $bits_len);
204+
205+
$bits = binary_vrl_decode($bits) if $with_vrle;
206+
207+
my $idx = bytes2int($fh, 4);
208+
my $alphabet = decode_alphabet($fh);
209+
210+
my $rle = decode_huffman_entry($fh);
211+
my $mtf = zrle_decode($rle);
212+
my $bwt = mtf_decode($mtf, $alphabet);
213+
my $data = bwt_decode(pack('C*', @$bwt), $idx);
214+
215+
print $out_fh lzb_decompress(reverse_swap_transform($data, [split(//, $bits)]));
216+
}
217+
218+
# Close the file
219+
close $fh;
220+
close $out_fh;
221+
}
222+
223+
main();
224+
exit(0);

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ A nice collection of day-to-day Perl scripts.
146146
* [Myzlib file compression](./Compression/High-level/myzlib_file_compression.pl)
147147
* [Rablz file compression](./Compression/High-level/rablz_file_compression.pl)
148148
* [Rlzss file compression](./Compression/High-level/rlzss_file_compression.pl)
149+
* [Sbwt file compression](./Compression/High-level/sbwt_file_compression.pl)
149150
* [Xz file compression](./Compression/High-level/xz_file_compression.pl)
150151
* [Zlib file compression](./Compression/High-level/zlib_file_compression.pl)
151152
* [Zstd file compression](./Compression/High-level/zstd_file_compression.pl)

0 commit comments

Comments
 (0)