views:

466

answers:

2

Hi, there is an open-source morphological analyzer written in OCaml, named ocamorph. Download and build (make) instructions are here.

The Java binding is buggy and I have to fix it. After a few hours of struggling, it now seems that it will take me a few days, as I'm not familiar with C, JNI, OCaml, or this particular software.

Here you can see that it works for a small file (subtitles_136.hu.tok), but for a larger file (Tolkien_1.hu.tok) the JVM dies with "Segmentation fault":

bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ java -Djava.library.path=./output/ -cp output mokk.nlp.ocamorph.FileStemmer $HULEXICON src/java/mokk/nlp/ocamorph/cache2.txt > src/java/mokk/nlp/ocamorph/subtitles_136.hu.stem < src/java/mokk/nlp/ocamorph/subtitles_136.hu.tok
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ java -Djava.library.path=./output/ -cp output mokk.nlp.ocamorph.FileStemmer $HULEXICON src/java/mokk/nlp/ocamorph/cache.txt > src/java/mokk/nlp/ocamorph/Tolkien_1.en.stem < src/java/mokk/nlp/ocamorph/Tolkien_1.en.tok
Segmentation fault
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ ls -l src/java/mokk/nlp/ocamorph/
total 2116
-rw-rw-r-- 1 bpgergo breka    8505 2009-09-22 13:53 cache2.txt
-rw-rw-r-- 1 bpgergo breka      65 2009-07-07 18:48 Compounds.java
drwxrwxr-x 2 bpgergo breka    4096 2009-09-22 13:54 CVS
-rw-rw-r-- 1 bpgergo breka    5888 2009-09-18 17:19 FileStemmer.java
-rw-rw-r-- 1 bpgergo breka      77 2009-07-07 18:48 Guess.java
-rw-rw-r-- 1 bpgergo breka     953 2009-08-31 18:58 IOcamorphStemmer.java
-rw-rw-r-- 1 bpgergo breka    5419 2009-08-31 18:58 OcamorphCachedStemmer.java
-rw-rw-r-- 1 bpgergo breka    2836 2009-08-03 16:00 OcamorphStemmer.java
-rw-rw-r-- 1 bpgergo breka    4612 2009-09-22 12:51 OcamorphWrapper.java
-rw-rw-r-- 1 bpgergo breka    6731 2009-09-22 13:53 subtitles_136.hu.stem
-rw-rw-r-- 1 bpgergo breka    7356 2009-09-20 21:12 subtitles_136.hu.tok
-rw-rw-r-- 1 bpgergo breka    2907 2009-09-18 17:22 Tester.java
-rw-rw-r-- 1 bpgergo breka       0 2009-09-22 13:53 Tolkien_1.en.stem
-rw-rw-r-- 1 bpgergo breka 1033059 2009-09-17 16:09 Tolkien_1.en.tok
-rw-rw-r-- 1 bpgergo breka       0 2009-09-22 13:14 Tolkien_1.hu.stem
-rw-rw-r-- 1 bpgergo breka 1041968 2009-09-17 16:09 Tolkien_1.hu.tok
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $

This is the C part of the Java binding (/ocamorph/src/bindings/java/src/c/hunmorph_jnistub.c). This might be the buggy part, thanks for any hint or help for finding the bug:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "mokk_nlp_ocamorph_OcamorphWrapper.h"

#include "ocamorph.h"
/* Number of slots in the analyses array; also passed to analyze() (presumably as the upper limit on returned analyses - confirm against ocamorph.h) */
#define MAX_ANALYSIS 100
/* Size in bytes of each buffer malloc'ed into analyses[] by the init function; also passed to analyze() */
#define ANALYSIS_MAXLEN 100

 // buffer for a single analysis string; not referenced by any function shown in this file
  char analysis[ANALYSIS_MAXLEN];
  // input buffer; not referenced by any function shown in this file
  char buffer[500];
  // result buffers handed to analyze(); the slots are filled with malloc'ed memory by the init function.
  // Being file-scope state, these buffers are shared by every call into this library.
  char* analyses[MAX_ANALYSIS];

// method ID of the Java-side callback(byte[]) method; assigned by the initIDs function
jmethodID MID_InstanceMethodCall_callback;


/*
 * Looks up the Java method callback(byte[]) on the given class and stores its
 * method ID in the file-scope MID_InstanceMethodCall_callback for later use.
 *
 * env: JNI environment of the calling thread
 * cls: class on which the (static) native method was invoked
 */
JNIEXPORT void JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_initIDs
  (JNIEnv *env, jclass cls) {
  /* "([B)V" = takes a byte[] argument, returns void */
  jmethodID callback_id = (*env)->GetMethodID(env, cls, "callback", "([B)V");

  MID_InstanceMethodCall_callback = callback_id;
}
JNIEXPORT jlong JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_init
  (JNIEnv * env, jobject obj, jstring bin_arg) {

  /* Convert to UTF8 */
  const char *bin_file  = (*env)->GetStringUTFChars(env, bin_arg, JNI_FALSE);

  ocamorph_startup();
  ocamorph_engine engine = init_from_bin(bin_file,0/*Don't pass the stupid no_caps argument*/);

  /* Release created UTF8 string */
  (*env)->ReleaseStringUTFChars(env, bin_arg, bin_file);

  int i;
  for (i=0; i<MAX_ANALYSIS;i++) {
    analyses[i] = (char *) malloc(ANALYSIS_MAXLEN * sizeof(char));
  };

  return  (jlong) engine;

}

/*
 * Creates an analyzer for a previously loaded engine by forwarding all
 * arguments, in order, to make_analyzer().
 *
 * engine:        handle previously returned by the init function
 * blocking, compunds, stop_at_first, guess:
 *                integer option codes passed through unchanged
 *
 * Returns the analyzer handle cast to jlong.
 */
JNIEXPORT jlong JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_make_1analyzer
  (JNIEnv *env, jobject obj, jlong engine , jint blocking, jint compunds, jint stop_at_first, jint guess) {
  ocamorph_engine engine_handle = (ocamorph_engine) engine;
  ocamorph_engine analyzer_handle;

  analyzer_handle = make_analyzer(engine_handle, blocking, compunds, stop_at_first, guess);
  return (jlong) analyzer_handle;
}

/*
 * Analyzes one word and reports each resulting analysis to the Java object
 * by calling its callback(byte[]) method once per analysis.
 *
 * env:      JNI environment of the calling thread
 * obj:      the OcamorphWrapper instance that receives the callbacks
 * analyzer: handle previously returned by the make_analyzer native method
 * word:     the word as raw bytes; input longer than 999 bytes is truncated
 *
 * Returns early, leaving any Java exception pending, when a byte array cannot
 * be allocated or the callback throws.
 */
JNIEXPORT void JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_analyze
  (JNIEnv * env, jobject obj, jlong analyzer, jbyteArray word) {

  /* Compile-time constant so that wordc is a plain array, not a VLA */
  enum { MAX_INPUT_LENGTH = 1000 };

  ocamorph_engine analyzerc = (ocamorph_engine) analyzer;
  char wordc[MAX_INPUT_LENGTH];
  jsize len;
  int n;
  int i;

  if (word == NULL) {
    /* Nothing to analyze; GetArrayLength must not be called on NULL */
    return;
  }

  /* Copy the input bytes, truncating so that the terminator always fits */
  len = (*env)->GetArrayLength(env,word);
  if (len>=MAX_INPUT_LENGTH) { len = MAX_INPUT_LENGTH-1; }

  if (len!=0)
  {
    (*env)->GetByteArrayRegion(env,word,0,len,(jbyte*)wordc);
  }
  wordc[len] = '\0';

  n = analyze(analyzerc,wordc,analyses,MAX_ANALYSIS, ANALYSIS_MAXLEN);

  /* Never read past the end of the analyses array, whatever analyze() reports */
  if (n > MAX_ANALYSIS) { n = MAX_ANALYSIS; }

  for (i=0; i < n; ++i) {
    const char *ana = analyses[i];
    const char *terminator;
    jsize ana_len;
    jbyteArray jb;

    if (ana == NULL) {
      /* Buffer was never allocated (init not called); nothing to report */
      break;
    }

    /* Bounded length: do not scan beyond the ANALYSIS_MAXLEN-byte buffer
       even if the terminator is missing */
    terminator = (const char *) memchr(ana, '\0', ANALYSIS_MAXLEN);
    ana_len = (terminator != NULL) ? (jsize) (terminator - ana) : (jsize) ANALYSIS_MAXLEN;

    jb = (*env)->NewByteArray(env, ana_len);
    if (jb == NULL) {
      /* OutOfMemoryError is pending */
      return;
    }
    (*env)->SetByteArrayRegion(env, jb, 0, ana_len, (const jbyte *)ana);
    (*env)->CallVoidMethod(env, obj, MID_InstanceMethodCall_callback, jb);

    /* Free the local reference now: up to MAX_ANALYSIS arrays are created
       per call, while JNI only guarantees room for 16 local references */
    (*env)->DeleteLocalRef(env, jb);

    if ((*env)->ExceptionCheck(env)) {
      /* The callback threw; stop making JNI calls and let Java see it */
      return;
    }
  }
}

And here is the Java part (/ocamorph/src/bindings/java/src/java/mokk/nlp/ocamorph/OcamorphWrapper.java):

package mokk.nlp.ocamorph;

import java.io.UnsupportedEncodingException;
import java.util.LinkedList;
import java.util.List;

/**
 * JNI interface for Ocamorph. The constructor loads an ocamorph engine from a
 * binary resource and creates an analyzer on top of it.
 *
 * <p>An instance must not be used from several threads at once:
 * {@link #analyze(String)} collects its results in an instance field that the
 * native code fills through {@code callback}.
 *
 * @author bpgergo
 *
 */
public class OcamorphWrapper {

 /**
  * the encoding required by the ocamorph lib
  */
 private static final String ENCODING = "ISO-8859-2";

 /** Handle returned by the native {@code make_analyzer} call. */
 private final long analyzerId;

 /** Handle returned by the native {@code init} call. */
 private final long engineId;

 /**
  * analyze result (the callback will add the result strings)
  */
 private List<String> analyzeResult = null;

 /** One-time native setup; called from the static initializer. */
 private native static void initIDs();

 /** Loads an engine from the binary resource {@code bin} and returns its handle. */
 private native long init(String bin);

 // Native parameters: const ocamorph_engine engine, const int blocking,
 // const int compounds, const int stop_at_first, const int guess.
 // There is some bug in ocamorph: stop_at_first is what actually controls
 // compound handling.
 private native long make_analyzer(long engine, int blocking, int compounds,
   int stop_at_first, int guess);

 /** Analyzes {@code word}; results arrive through {@link #callback(byte[])}. */
 private native void analyze(long analyzer, byte[] word);

 static {
  //TODO FIXME how to define the library dynamically?
  System.loadLibrary("ocamorph");
  initIDs();
 }

 /**
  * Loads a new Ocamorph engine, using the given binary resource and the arguments.
  *
  * @param bin path of the binary resource to load
  * @param blocking whether blocking is enabled
  * @param stopAtFirst whether to stop at the first analysis
  * @param compounds compound handling mode; must not be {@code null}
  * @param guess guessing mode; must not be {@code null}
  */
 public OcamorphWrapper(String bin, boolean blocking, boolean stopAtFirst,
   Compounds compounds, Guess guess) {
  engineId = init(bin);
  int comp = compounds2Code(compounds);
  int gu = guessToCode(guess);
  // NOTE(review): the stopAtFirst code is passed in the native "compounds"
  // slot and the compounds code in the "stop_at_first" slot. This matches
  // the remark on make_analyzer that stop_at_first controls compound
  // handling, so it looks like a deliberate workaround - confirm before
  // changing the order.
  analyzerId = make_analyzer(engineId, boolean2Code(blocking),
    boolean2Code(stopAtFirst), comp, gu);
 }

 /**
  * This is the interface method for ocamorph analysis for the java side.
  *
  * @param word the word to analyze; must not be {@code null}
  * @return the analyses reported by the native library, in the order they
  *         were reported; empty if there were none or if the word could not
  *         be encoded
  */
 public List<String> analyze(String word) {
  analyzeResult = new LinkedList<String>();
  try {
   byte[] encoded = word.getBytes(ENCODING);
   analyze(analyzerId, encoded);
  } catch (UnsupportedEncodingException e1) {
   System.err
     .println("Ocamorph analyze UnsupportedEncodingException: ");
   e1.printStackTrace();
  }
  return analyzeResult;
 }

 /**
  * The C interface will call this method to return analysis results.
  *
  * <p>Decoding failures are reported on stderr instead of being thrown, so
  * that no exception is left pending in the native caller. A result that
  * cannot be decoded is skipped rather than stored as {@code null}.
  *
  * @param ana one analysis as bytes in the ocamorph encoding
  */
 private void callback(byte[] ana) {
  try {
   // bpgergo 20090618: decoding with the platform default charset was a bug
   analyzeResult.add(new String(ana, ENCODING));
  } catch (UnsupportedEncodingException e) {
   System.err.println("callback new String(ana, encoding) UnsupportedEncodingException:");
   e.printStackTrace();
  }
 }

 /* static argument conversion methods */

 /** Maps {@code true} to 1 and {@code false} to 0. */
 private static int boolean2Code(boolean bool){
  return bool ? 1 : 0;
 }

 /** Maps a {@link Compounds} value to the integer code passed to the native side. */
 private static int compounds2Code(Compounds compounds){
  switch (compounds) {
  case Allow:
   return 1;
  case No:
  default:
   return 0;
  }
 }

 /** Maps a {@link Guess} value to the integer code passed to the native side. */
 private static int guessToCode(Guess guess){
  switch (guess) {
  case Fallback:
   return 1;
  case Global:
   return 2;
  case NoGuess:
  default:
   return 0;
  }
 }

 /* getter/setter methods */

 /** @return the name of the encoding used to exchange text with the ocamorph lib */
 public String getEncoding() {
  return ENCODING;
 }

 /** @return the native analyzer handle */
 public long getAnalyzerId() {
  return analyzerId;
 }

 /** @return always {@code false}; debug output is currently disabled */
 public boolean isDebug() {
  return false;
 }

 /**
  * Has no effect; debug output is currently disabled.
  *
  * @param debug ignored
  */
 public void setDebug(boolean debug) {
  // intentionally empty
 }

}
A: 

Try building the C code with debug information, and look up how to enable core dumps on your (seemingly Unix-like) operating system. That should give you a starting point.

unwind
+1  A: 

Is an hs_err_pid<pid>.log file (the HotSpot fatal error log) being created when the JVM crashes? These logs can often help in figuring out such problems.

My guess is that it has something to do with the wacky way that the MID_InstanceMethodCall_callback method ID is being set. The ID is stored as a global value, and it only gets set if the initIDs static method is called, which doesn't appear to happen in your sample code. If it's not set, then analyze will crash when it tries to call the callback method. A way to ensure that you get the callback method ID would be the following:

/* Look up the callback method on the receiver's own class at call time,
   instead of relying on a method ID cached in a global variable. */
jclass cls = (*env)->GetObjectClass(env, obj);
if(cls == NULL){
  //Handle any errors (placeholder: no handling is shown in this snippet)
}
/* "([B)V" is the JNI signature of a method that takes a byte[] and returns void */
jmethodID mid = (*env)->GetMethodID(env, cls, "callback", "([B)V");
if(mid == NULL){
  //Handle any more errors (placeholder: no handling is shown in this snippet)
}
int i;
/* n and analyses come from the surrounding function: one callback per analysis */
for (i=0; i < n; ++i) {
  //  jstring ana = (*env)->NewStringUTF(env, analyses[i]);
  char* ana = analyses[i];
  /* Copy the analysis bytes into a new Java byte[] and hand it to callback */
  jbyteArray jb=(*env)->NewByteArray(env, strlen(ana));
  (*env)->SetByteArrayRegion(env, jb, 0, strlen(ana), (jbyte *)ana);
  (*env)->CallVoidMethod(env, obj, mid, jb);

}
TwentyMiles