package rog;

import java.util.*;
import java.io.*;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.*;

/**
 * A lexer for Charniak input sentences.
 *
 * <p>Sentences are delimited by {@code <s>} and {@code </s>} markers. Text
 * outside a sentence is skipped. Inside a sentence the lexer emits
 * {@link Word} tokens, and emits {@link #SENTENCE_BOUNDARY} when the closing
 * marker is reached. End of input is reported as {@code null}.
 *
 * @author Roger Levy
 */

%%

%{
  /** Marker token returned each time a closing sentence delimiter is read. */
  static final Word SENTENCE_BOUNDARY = new Word("SENTENCE_BOUNDARY");

  /**
   * Returns the next token produced by the scanner.
   * Presumably this is the hook AbstractTokenizer calls -- confirm against
   * that class.
   *
   * @return the next token, or {@code null} at end of input or when the
   *         underlying reader throws an {@link IOException}
   */
  public Object getNext() {
    try {
      return yylex();
    } catch (IOException e) {
      // Existing contract: an I/O failure is reported the same way as end
      // of input. NOTE(review): callers cannot tell the two apart; consider
      // surfacing the exception once the callers are known.
      return null;
    }
  }

  /**
   * Tokenizes the file named by {@code args[0]} and prints one token per line.
   *
   * @param args command-line arguments; {@code args[0]} is the input file path
   * @throws IOException if the file cannot be opened or closed
   */
  public static void main(String[] args) throws IOException {
    Reader r = new FileReader(args[0]);
    try {
      Tokenizer t = new CharniakTokenizer(r);
      while (t.hasNext()) {
        System.out.println(t.next());
      }
    } finally {
      // Release the file handle even if tokenizing fails part-way through.
      r.close();
    }
  }
%}

%class CharniakTokenizer
%implements Tokenizer
%extends AbstractTokenizer
%unicode
%type Object
%eofval{
  return null;
%eofval}

/* Lexical state that is active between an opening and a closing marker. */
%state SENTENCE

SentenceLetter = s
BeginSentence = <{SentenceLetter}>
EndSentence = <\/{SentenceLetter}>
WhiteSpace = [ \t\r\n\f]
TokenCharacter = [^ \t\r\n\f]
Token = {TokenCharacter}+
/* NOTE(review): by De Morgan this expression reduces to the literal "asdf"
   (and "asdf" is never of the form Token followed by EndSentence), which looks
   like a debugging placeholder. It was probably meant to describe a Token that
   does not end in the closing marker -- confirm the intended definition before
   relying on it. Left unchanged here. */
NonDelimiterToken = !((!(asdf))|({Token}{EndSentence}))

%%

/* Outside a sentence: everything is skipped until an opening marker is seen.
   Skipping uses empty actions so the generated scan loop simply continues;
   calling yylex() recursively from the action would add one stack frame per
   skipped character. */
<YYINITIAL> {
  {BeginSentence} / .*  { yybegin(SENTENCE); }
  {WhiteSpace}          { /* skip whitespace */ }
  .                     { /* skip text outside a sentence */ }
}

/* Inside a sentence: emit tokens until the closing marker. */
<SENTENCE> {
  {EndSentence} / .*    { yybegin(YYINITIAL);
                          return SENTENCE_BOUNDARY; }
  {NonDelimiterToken} / {EndSentence}
                        { return new Word(yytext()); }
  {WhiteSpace}          { /* skip whitespace */ }
  .                     { /* skip unrecognised character */ }
}