Hi, there is this morphological analyzer (open source, written in OCml) named ocamorph. download and make instructions here
The java binding is buggy and I'll have to fix it and after a few hours of struggle now it seems to me it'll take a few days to fix it as I'm not familiar with C, JNI, OCml and this particular software.
Here you can see that for a small file (subtitles_136.hu.tok) it works but for a larger file (Tolkien_1.hu.tok) "Segmentation error" is thrown:
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ java -Djava.library.path=./output/ -cp output mokk.nlp.ocamorph.FileStemmer $HULEXICON src/java/mokk/nlp/ocamorph/cache2.txt > src/java/mokk/nlp/ocamorph/subtitles_136.hu.stem < src/java/mokk/nlp/ocamorph/subtitles_136.hu.tok
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ java -Djava.library.path=./output/ -cp output mokk.nlp.ocamorph.FileStemmer $HULEXICON src/java/mokk/nlp/ocamorph/cache.txt > src/java/mokk/nlp/ocamorph/Tolkien_1.en.stem < src/java/mokk/nlp/ocamorph/Tolkien_1.en.tok
Segmentation fault
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ ls -l src/java/mokk/nlp/ocamorph/
total 2116
-rw-rw-r-- 1 bpgergo breka 8505 2009-09-22 13:53 cache2.txt
-rw-rw-r-- 1 bpgergo breka 65 2009-07-07 18:48 Compounds.java
drwxrwxr-x 2 bpgergo breka 4096 2009-09-22 13:54 CVS
-rw-rw-r-- 1 bpgergo breka 5888 2009-09-18 17:19 FileStemmer.java
-rw-rw-r-- 1 bpgergo breka 77 2009-07-07 18:48 Guess.java
-rw-rw-r-- 1 bpgergo breka 953 2009-08-31 18:58 IOcamorphStemmer.java
-rw-rw-r-- 1 bpgergo breka 5419 2009-08-31 18:58 OcamorphCachedStemmer.java
-rw-rw-r-- 1 bpgergo breka 2836 2009-08-03 16:00 OcamorphStemmer.java
-rw-rw-r-- 1 bpgergo breka 4612 2009-09-22 12:51 OcamorphWrapper.java
-rw-rw-r-- 1 bpgergo breka 6731 2009-09-22 13:53 subtitles_136.hu.stem
-rw-rw-r-- 1 bpgergo breka 7356 2009-09-20 21:12 subtitles_136.hu.tok
-rw-rw-r-- 1 bpgergo breka 2907 2009-09-18 17:22 Tester.java
-rw-rw-r-- 1 bpgergo breka 0 2009-09-22 13:53 Tolkien_1.en.stem
-rw-rw-r-- 1 bpgergo breka 1033059 2009-09-17 16:09 Tolkien_1.en.tok
-rw-rw-r-- 1 bpgergo breka 0 2009-09-22 13:14 Tolkien_1.hu.stem
-rw-rw-r-- 1 bpgergo breka 1041968 2009-09-17 16:09 Tolkien_1.hu.tok
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $
This is the C part of the Java binding (/ocamorph/src/bindings/java/src/c/hunmorph_jnistub.c). This might be the buggy part, thanks for any hint or help for finding the bug:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "mokk_nlp_ocamorph_OcamorphWrapper.h"
#include "ocamorph.h"
#define MAX_ANALYSIS 100
#define ANALYSIS_MAXLEN 100
// initialize the analysis string
char analysis[ANALYSIS_MAXLEN];
// initialize input buffer
char buffer[500];
char* analyses[MAX_ANALYSIS];
jmethodID MID_InstanceMethodCall_callback;
JNIEXPORT void JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_initIDs
(JNIEnv *env, jclass cls) {
MID_InstanceMethodCall_callback =
(*env)->GetMethodID(env, cls, "callback", "([B)V");
}
JNIEXPORT jlong JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_init
(JNIEnv * env, jobject obj, jstring bin_arg) {
/* Convert to UTF8 */
const char *bin_file = (*env)->GetStringUTFChars(env, bin_arg, JNI_FALSE);
ocamorph_startup();
ocamorph_engine engine = init_from_bin(bin_file,0/*Don't pass the stupid no_caps argument*/);
/* Release created UTF8 string */
(*env)->ReleaseStringUTFChars(env, bin_arg, bin_file);
int i;
for (i=0; i<MAX_ANALYSIS;i++) {
analyses[i] = (char *) malloc(ANALYSIS_MAXLEN * sizeof(char));
};
return (jlong) engine;
}
JNIEXPORT jlong JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_make_1analyzer
(JNIEnv *env, jobject obj, jlong engine , jint blocking, jint compunds, jint stop_at_first, jint guess) {
ocamorph_engine analyzer = make_analyzer((ocamorph_engine) engine, blocking, compunds, stop_at_first, guess);
return (jlong) analyzer;
}
JNIEXPORT void JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_analyze
(JNIEnv * env, jobject obj, jlong analyzer, jbyteArray word) {
ocamorph_engine analyzerc = (ocamorph_engine) analyzer;
/* Convert to UTF8 */
// const char *wordc = (*env)->GetStringUTFChars(env, word, JNI_FALSE);
//char *wordc = (char *) (*env)->GetByteArrayElements( env, word, 0);
const int maxInputLength = 1000;
char wordc[maxInputLength];
jsize len = (*env)->GetArrayLength(env,word);
if (len>=maxInputLength) { len = maxInputLength-1; }
if (len!=0)
{
(*env)->GetByteArrayRegion(env,word,0,len,(jbyte*)wordc);
}
wordc[len] = '\0';
int n = analyze(analyzerc,wordc,analyses,MAX_ANALYSIS, ANALYSIS_MAXLEN);
int i;
for (i=0; i < n; ++i) {
// jstring ana = (*env)->NewStringUTF(env, analyses[i]);
char* ana = analyses[i];
jbyteArray jb=(*env)->NewByteArray(env, strlen(ana));
(*env)->SetByteArrayRegion(env, jb, 0, strlen(ana), (jbyte *)ana);
(*env)->CallVoidMethod(env, obj, MID_InstanceMethodCall_callback, jb);
}
// (*env)->ReleaseStringUTFChars(env, word, wordc);
}
And here is the Java part (/ocamorph/src/bindings/java/src/java/mokk/nlp/ocamorph/OcamorphWrapper.java):
package mokk.nlp.ocamorph;
import java.io.UnsupportedEncodingException;
import java.util.LinkedList;
import java.util.List;
/**
* JNI interface for Ocamorph. Constructor loads ocamorph engine and a specified binary resource.
*
* @author bpgergo
*
*/
public class OcamorphWrapper {
private long analyzerId;
private long engineId;
private native static void initIDs();
private native long init(String bin);
// const ocamorph_engine engine, const int blocking, const int compounds,
// const int stop_at_first, const int guess
// valami hiba van az ocamorph-ban, mert a stop_at_first vezerli az
// osszetettszosagot
private native long make_analyzer(long engine, int blocking, int compounds,
int stop_at_first, int guess);
private native void analyze(long analyzer, byte[] word);
static {
//TODO FIXME how to define the library dynamically?
System.loadLibrary("ocamorph");
initIDs();
}
/**
* the encoding required by the ocamorph lib
*/
private static String encoding = "ISO-8859-2";
//private static boolean debug = false;
/**
* analyze result (the callback will add the result strings)
*/
private List<String> analyzeResult = null;
/**
* Loads a new Ocamorph engine, using the given binary resource and the arguments.
*
* @param bin
* @param blocking
* @param stopAtFirst
* @param compounds
* @param guess
*/
public OcamorphWrapper(String bin, boolean blocking, boolean stopAtFirst,
Compounds compounds, Guess guess) {
super();
engineId = init(bin);
int comp = compounds2Code(compounds);
int gu = guessToCode(guess);
analyzerId = make_analyzer(engineId, boolean2Code(blocking), boolean2Code(stopAtFirst),
comp, gu);
//debug("engineId:"+engineId);
//debug("analyzerId:"+analyzerId);
//debug = false;
}
/**
* This is the interface method for ocamorph analysis for the java side.
* @param ba
*/
public List<String> analyze(String word) {
//debug("analyze:");
analyzeResult = new LinkedList<String>();
byte[] ba = null;
try {
ba = word.getBytes(encoding);
} catch (UnsupportedEncodingException e1) {
System.err
.println("Ocamorph analyze UnsupportedEncodingException: ");
e1.printStackTrace();
}
if (ba != null){
//debug //printBytes(ba, "analizze:");
analyze(analyzerId, ba);
}
return analyzeResult;
}
/**
* The C interface will call this method to return analysis results
*/
private void callback(byte[] ana) {
String s = null;
try {
// bpgergo 20090618 this was a bug
// s = new String(ana);
s = new String(ana, encoding);
} catch (UnsupportedEncodingException e) {
System.err.println("callback new String(ana, encoding) UnsupportedEncodingException:");
e.printStackTrace();
}
analyzeResult.add(s);
//if (s != null) {
//debug("!callback recieved: ");
// debug //printBytes(ana, s);
//} else {
//debug("callback s == null");
//}
}
/* static argument conversion methods */
private static int boolean2Code(boolean bool){
if (bool){
return 1;
} else {
return 0;
}
}
private static int compounds2Code(Compounds compounds){
int comp = 0;
switch (compounds) {
case No:
comp = 0;
break;
case Allow:
comp = 1;
break;
}
return comp;
}
private static int guessToCode(Guess guess){
int gu = 0;
switch (guess) {
case NoGuess:
gu = 0;
break;
case Fallback:
gu = 1;
break;
case Global:
gu = 2;
break;
}
return gu;
}
public String getEncoding() {
return encoding;
}
public long getAnalyzerId() {
return analyzerId;
}
/*private static void debug(String string) {
if (debug) {
System.out.println(string);
}
}*/
/* getter/setter methods */
public boolean isDebug() {
return false; //debug;
}
public void setDebug(boolean debug) {
//OcamorphWrapper.debug = debug;
}
/* static debug methods */
/*public static void printBytes(byte[] array, String name) {
if (debug) {
for (int k = 0; k < array.length; k++) {
debug(name + "[" + k + "] = " + "0x" + byteToHex(array[k]));
}
}
}*/
/*static public String byteToHex(byte b) {
// Returns hex String representation of byte b
char hexDigit[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
'a', 'b', 'c', 'd', 'e', 'f' };
char[] array = { hexDigit[(b >> 4) & 0x0f], hexDigit[b & 0x0f] };
return new String(array);
}*/
/*static public String charToHex(char c) {
// Returns hex String representation of char c
byte hi = (byte) (c >>> 8);
byte lo = (byte) (c & 0xff);
return byteToHex(hi) + byteToHex(lo);
}*/
}