I think I can do better (sorry a bit long):
import java.io.File;
import java.io.FileInputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
public class LZ78 {
/**
* Uses the LZ78 compression algorithm to approximate the Kolmogorov
* complexity of a String
*
* @param s
* @return the approximate Kolmogorov complexity
*/
public static double kComplexity(String s) {
Set<String> dictionary = new HashSet<String>();
StringBuilder w = new StringBuilder();
double comp = 0;
for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i);
if (dictionary.contains(w.toString() + c)) {
w.append(c);
} else {
comp++;
dictionary.add(w.toString() + c);
w = new StringBuilder();
}
}
if (w.length() != 0) {
comp++;
}
return comp;
}
private static boolean equal(Object o1, Object o2) {
return o1 == o2 || (o1 != null && o1.equals(o2));
}
public static final class FList<T> {
public final FList<T> head;
public final T tail;
private final int hashCodeValue;
public FList(FList<T> head, T tail) {
this.head = head;
this.tail = tail;
int v = head != null ? head.hashCodeValue : 0;
hashCodeValue = v * 31 + (tail != null ? tail.hashCode() : 0);
}
@Override
public boolean equals(Object obj) {
if (obj instanceof FList<?>) {
FList<?> that = (FList<?>) obj;
return equal(this.head, that.head)
&& equal(this.tail, that.tail);
}
return false;
}
@Override
public int hashCode() {
return hashCodeValue;
}
@Override
public String toString() {
return head + ", " + tail;
}
}
public static final class FListChar {
public final FListChar head;
public final char tail;
private final int hashCodeValue;
public FListChar(FListChar head, char tail) {
this.head = head;
this.tail = tail;
int v = head != null ? head.hashCodeValue : 0;
hashCodeValue = v * 31 + tail;
}
@Override
public boolean equals(Object obj) {
if (obj instanceof FListChar) {
FListChar that = (FListChar) obj;
return equal(this.head, that.head) && this.tail == that.tail;
}
return false;
}
@Override
public int hashCode() {
return hashCodeValue;
}
@Override
public String toString() {
return head + ", " + tail;
}
}
public static double kComplexity2(String s) {
Map<FList<Character>, FList<Character>> dictionary =
new HashMap<FList<Character>, FList<Character>>();
FList<Character> w = null;
double comp = 0;
for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i);
FList<Character> w1 = new FList<Character>(w, c);
FList<Character> ex = dictionary.get(w1);
if (ex != null) {
w = ex;
} else {
comp++;
dictionary.put(w1, w1);
w = null;
}
}
if (w != null) {
comp++;
}
return comp;
}
public static double kComplexity3(String s) {
Map<FListChar, FListChar> dictionary =
new HashMap<FListChar, FListChar>(1024);
FListChar w = null;
double comp = 0;
for (int i = 0; i < s.length(); i++) {
char c = s.charAt(i);
FListChar w1 = new FListChar(w, c);
FListChar ex = dictionary.get(w1);
if (ex != null) {
w = ex;
} else {
comp++;
dictionary.put(w1, w1);
w = null;
}
}
if (w != null) {
comp++;
}
return comp;
}
public static void main(String[] args) throws Exception {
File f = new File("methods.txt");
byte[] data = new byte[(int) f.length()];
FileInputStream fin = new FileInputStream(f);
int len = fin.read(data);
fin.close();
final String test = new String(data, 0, len);
final int n = 100;
ExecutorService exec = Executors.newFixedThreadPool(1);
exec.submit(new Runnable() {
@Override
public void run() {
long t = System.nanoTime();
double value = 0;
for (int i = 0; i < n; i++) {
value += kComplexity(test);
}
System.out.printf("kComplexity: %.3f; Time: %d ms%n",
value / n, (System.nanoTime() - t) / 1000000);
}
});
exec.submit(new Runnable() {
@Override
public void run() {
long t = System.nanoTime();
double value = 0;
for (int i = 0; i < n; i++) {
value += kComplexity2(test);
}
System.out.printf("kComplexity2: %.3f; Time: %d ms%n", value
/ n, (System.nanoTime() - t) / 1000000);
}
});
exec.submit(new Runnable() {
@Override
public void run() {
long t = System.nanoTime();
double value = 0;
for (int i = 0; i < n; i++) {
value += kComplexity3(test);
}
System.out.printf("kComplexity3: %.3f; Time: %d ms%n", value
/ n, (System.nanoTime() - t) / 1000000);
}
});
exec.shutdown();
}
}
Results:
kComplexity: 41546,000; Time: 17028 ms
kComplexity2: 41546,000; Time: 6555 ms
kComplexity3: 41546,000; Time: 5971 ms
Edit Peer pressure: How does it work?
Franky, have no idea, it just seemed to be a good way to speed up things. I have to figure it out too, so here we go.
It was an observation that the original code made lot of string appends, however, replacing it with a LinkedList<String>
wouldn't help as there is a constant pressure for looking up collections in the hash-table - every time, the hashCode() and equals() are used it needs to traverse the entire list.
But how can I make sure the code doesn't perform this unnecessary? The answer: Immutability - if your class is immutable, then at least the hashCode is constant, therefore, can be pre-computed. The equality check can be shortcutted too - but in worst case scenario it will still traverse the entire collection.
That's nice, but then how do you 'modify' an immutable class. No you don't, you create a new one every time a different content is needed. However, when you look close at the contents of the dictionary, you'll recognize it contains redundant information: []a
, [abc]d
, [abc]e
, [abcd]f
. So why not just store the head as a pointer to a previously seen pattern and have a tail for the current character?
Exactly. Using immutability and back-references you can spare space and time, and even the garbage collector will love you too. One speciality of my solution is that in F# and Haskell, the list uses a head:[tail] signature - the head is the element type and the tail is a pointer to the next collection. In this question, the opposite was required as the lists grow at the tail side.
From here on its just some further optimization - for example, use a concrete char
as the tail type to avoid constant char autoboxing of the generic version.
One drawback of my solution is that it utilizes recursion when calculating the aspects of the list. For relatively small list it's fine, but longer list could require you to increase the Thread stack size on which your computation runs. In theory, with Java 7's tail call optimization, my code can be rewritten in such a way that it allows the JIT to perform the optimization (or is it already so? Hard to tell).