-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathburrows-wheeler_transform_symbolic.sf
107 lines (83 loc) · 2.45 KB
/
burrows-wheeler_transform_symbolic.sf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/ruby
# Author: Trizen
# Date: 14 June 2023
# Edit: 15 June 2023
# https://github.com/trizen
# Burrows–Wheeler transform, implemented to work on an array of symbols.
# https://rosettacode.org/wiki/Burrows–Wheeler_transform
# Reference:
# Data Compression (Summer 2023) - Lecture 12 - The Burrows-Wheeler Transform (BWT)
# https://youtube.com/watch?v=rQ7wwh4HRZM
func bwt_cyclic (s) { # O(n) space
var len = s.len
gather { # check if all the symbols are the same
take(true)
s.each_cons(2, {|a,b|
if (a != b) {
take(false)
break
}
})
} == [true] && return @(^len)
@(^len) -> sort {|i,j|
while (s[i] == s[j]) {
i %= len if (++i >= len)
j %= len if (++j >= len)
}
s[i] <=> s[j]
}
}
func bwt_encode_quadratic(Array s) { # O(n^2) space
#var bwt = s.len.of{|i| s.slice(i) + s.slice(0, i) }.sort
var bwt = s.len.of{|i| s.rotate(i) }.sort
var ret = bwt.map { .last }
var idx = bwt.index(s)
return (ret, idx)
}
func bwt_encode(Array s) {
var bwt = bwt_cyclic(s)
var ret = bwt.map {|i| s.slice(i-1, 1)... }
var idx = bwt.first_index_by { .is_zero }
return (ret, idx)
}
func bwt_decode(Array bwt, Number idx) { # fast inversion
var tail = bwt
var head = bwt.sort
var indices = []
tail.each_kv {|i,v|
indices[v] := [] << i
}
var table = []
head.each_kv {|i,b|
table[i] = indices[b].shift
}
var dec = []
var i = idx
head.len.times {
dec << head[i]
i = table[i]
}
return dec
}
var tests = [
"banana", "appellee", "dogwood", "TOBEORNOTTOBEORTOBEORNOT"
"SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES", "PINEAPPLE",
"","a","aa","aabb","aaaaaaaaaaaa","aaaaaaaaaaaab",
"baaaaaaaaaaaa","aaaaaabaaaaaa","aaaaaaabaaaaa",
File(__FILE__).read(:raw),
]
tests.each { |str|
var (enc, idx) = bwt_encode(str.bytes)
if (enc.len < 1024) {
say "BWT(#{str.dump}) = (#{enc.join_bytes.dump}, #{idx})"
}
var dec = bwt_decode(enc, idx)
assert_eq(str, dec.pack('C*'))
}
__END__
BWT("banana") = ("nnbaaa", 3)
BWT("appellee") = ("eelplepa", 0)
BWT("dogwood") = ("odoodwg", 1)
BWT("TOBEORNOTTOBEORTOBEORNOT") = ("OOOBBBRRTTTEEENNOOORTTOO", 20)
BWT("SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES") = ("TEXYDST.E.IXIXIXXSSMPPS.B..E.S.EUSFXDIIOIIIT", 29)
BWT("PINEAPPLE") = ("ENLPPIEPA", 6)