๐Ÿ“ฆ ryzokuken / compression

๐Ÿ“„ mod.rs ยท 106 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106#[derive(Debug)]
struct Node {
    offset: usize,
    length: usize,
    symbol: u8,
}

impl Node {
    fn serialize(&self) -> Vec<u8> {
        let mut out = Vec::new();
        out.extend_from_slice(&self.offset.to_be_bytes());
        out.extend_from_slice(&self.length.to_be_bytes());
        out.push(self.symbol);
        out
    }

    fn deserialize(bytes: &[u8]) -> Node {
        Node {
            offset: usize::from_be_bytes(from_slice(&bytes[0..8])),
            length: usize::from_be_bytes(from_slice(&bytes[8..16])),
            symbol: bytes[16],
        }
    }

    fn blank(symbol: u8) -> Node {
        Node {
            offset: 0,
            length: 0,
            symbol,
        }
    }

    fn new(offset: usize, length: usize) -> Node {
        Node {
            offset,
            length,
            symbol: 0u8,
        }
    }
}

fn from_slice(bytes: &[u8]) -> [u8; 8] {
    let mut arr = [0; 8];
    arr.copy_from_slice(&bytes[..]);
    arr
}

fn deserialize_nodes(data: &[u8]) -> Vec<Node> {
    let mut nodes = Vec::new();
    for block in data.chunks(17) {
        println!("{:?}", block);
        nodes.push(Node::deserialize(block));
    }
    nodes
}

fn serialize_nodes(nodes: &[Node]) -> Vec<u8> {
    let mut out = Vec::new();
    for node in nodes.iter() {
        out.append(&mut node.serialize());
    }
    out
}

pub fn encode(data: &[u8]) -> Vec<u8> {
    let mut nodes = Vec::new();
    nodes.push(Node::blank(data[0]));
    let mut idx = 1;
    'outer: while idx < data.len() {
        let dict = &data[0..idx];
        let mut best = Node::blank(data[idx]);
        let mut len = 1;
        'inner: while len <= idx {
            let buf = &data[idx..idx + len];
            // Naive: break dict into windows of length buf.len() and check if
            // any of them is equal to buf.
            let pos = dict.windows(buf.len()).position(|win| win == buf);
            if pos.is_some() {
                best = Node::new(pos.unwrap(), buf.len());
                // If there's more bytes in the lookahead buffer, keep going.
                if idx + len < data.len() {
                    best.symbol = data[idx + len];
                    len += 1;
                    continue 'inner;
                }
            }
            // If there weren't any matches or we ran out of bytes in the
            // lookahead buffer, call it a day.
            idx += best.length + 1;
            nodes.push(best);
            continue 'outer;
        }
    }
    serialize_nodes(nodes.as_slice())
}

#[cfg(test)]
mod tests {
    #[test]
    fn basic() {
        let string = b"abcabdca";
        let _compressed = super::encode(string);
        println!("{:?}", _compressed)
    }
}