public class ChineseCharacterBasedLexicon extends java.lang.Object implements Lexicon
BOUNDARY, BOUNDARY_TAG, UNKNOWN_WORD
Constructor and Description |
---|
ChineseCharacterBasedLexicon(ChineseTreebankParserParams params,
Index<java.lang.String> wordIndex,
Index<java.lang.String> tagIndex) |
Modifier and Type | Method and Description |
---|---|
void |
finishTraining()
Done collecting statistics for the lexicon.
|
Distribution<java.lang.String> |
getPOSDistribution() |
UnknownWordModel |
getUnknownWordModel() |
void |
incrementTreesRead(double weight)
If training on a per-word basis instead of on a per-tree basis,
we will want to increment the tree count as this happens.
|
void |
initializeTraining(double numTrees)
Start training this lexicon on the expected number of trees.
|
static boolean |
isForeign(java.lang.String s) |
boolean |
isKnown(int word)
Checks whether a word is in the lexicon.
|
boolean |
isKnown(java.lang.String word)
Checks whether a word is in the lexicon.
|
int |
numRules()
Returns the number of rules (tag rewrites as word) in the Lexicon.
|
void |
readData(java.io.BufferedReader in)
Read the lexicon from the BufferedReader in the format written by
writeData.
|
java.util.Iterator<IntTaggedWord> |
ruleIteratorByWord(int word,
int loc,
java.lang.String featureSpec)
Get an iterator over all rules (pairs of (word, POS)) for this word.
|
java.util.Iterator<IntTaggedWord> |
ruleIteratorByWord(java.lang.String word,
int loc,
java.lang.String featureSpec)
Same thing, but with a string that needs to be translated by the
lexicon's word index.
|
java.lang.String |
sampleFrom()
Samples over words regardless of POS: first samples POS, then samples
word according to that POS
|
java.lang.String |
sampleFrom(java.lang.String tag)
Samples from the distribution over words with this POS according to the lexicon.
|
float |
score(IntTaggedWord iTW,
int loc,
java.lang.String word,
java.lang.String featureSpec)
Get the score of this word with this tag (as an IntTaggedWord) at this
loc.
|
void |
setUnknownWordModel(UnknownWordModel uwm) |
java.util.Set<java.lang.String> |
tagSet(java.util.function.Function<java.lang.String,java.lang.String> basicCategoryFunction)
Return the Set of tags used by this tagger (available after training the tagger).
|
void |
train(java.util.Collection<Tree> trees)
Train this lexicon on the given set of trees.
|
void |
train(java.util.Collection<Tree> trees,
java.util.Collection<Tree> rawTrees) |
void |
train(java.util.Collection<Tree> trees,
double weight)
Train this lexicon on the given set of trees.
|
void |
train(java.util.List<TaggedWord> sentence,
double weight)
Not all subclasses support this particular method.
|
void |
train(TaggedWord tw,
int loc,
double weight)
Not all subclasses support this particular method.
|
void |
train(Tree tree,
double weight)
TODO: make this method do something with the weight
|
void |
trainUnannotated(java.util.List<TaggedWord> sentence,
double weight)
Sometimes we might have a sentence of tagged words which we would
like to add to the lexicon, but they weren't part of a binarized,
markovized, or otherwise annotated tree.
|
void |
writeData(java.io.Writer w)
Write the lexicon in human-readable format to the Writer.
|
public ChineseCharacterBasedLexicon(ChineseTreebankParserParams params, Index<java.lang.String> wordIndex, Index<java.lang.String> tagIndex)
public void initializeTraining(double numTrees)
Lexicon
initializeTraining
in interface Lexicon
public void train(java.util.Collection<Tree> trees)
public void train(java.util.Collection<Tree> trees, double weight)
public void train(Tree tree, double weight)
public void trainUnannotated(java.util.List<TaggedWord> sentence, double weight)
Lexicon
trainUnannotated
in interface Lexicon
public void incrementTreesRead(double weight)
Lexicon
incrementTreesRead
in interface Lexicon
public void train(TaggedWord tw, int loc, double weight)
Lexicon
public void train(java.util.List<TaggedWord> sentence, double weight)
Lexicon
public void finishTraining()
Lexicon
finishTraining
in interface Lexicon
public Distribution<java.lang.String> getPOSDistribution()
public static boolean isForeign(java.lang.String s)
public float score(IntTaggedWord iTW, int loc, java.lang.String word, java.lang.String featureSpec)
Lexicon
score
in interface Lexicon
iTW
- An IntTaggedWord pairing a word and POS tag
loc
- The position in the sentence. In the default implementation
this is used only for unknown words to change their
probability distribution when sentence initial.
word
- The word itself; useful so we don't have to look it
up in an index
featureSpec
- TODO
public java.lang.String sampleFrom(java.lang.String tag)
tag
- the POS of the word to sample
public java.lang.String sampleFrom()
public java.util.Iterator<IntTaggedWord> ruleIteratorByWord(int word, int loc, java.lang.String featureSpec)
Lexicon
ruleIteratorByWord
in interface Lexicon
word
- The word, represented as an integer in Index
loc
- The position of the word in the sentence (counting from 0).
Implementation note: The BaseLexicon class doesn't
actually make use of this position information.
featureSpec
- Additional word features like morphosyntactic information.
(Each can be thought of as a tag -> word rule.)
public java.util.Iterator<IntTaggedWord> ruleIteratorByWord(java.lang.String word, int loc, java.lang.String featureSpec)
Lexicon
ruleIteratorByWord
in interface Lexicon
public int numRules()
public void readData(java.io.BufferedReader in) throws java.io.IOException
Lexicon
public void writeData(java.io.Writer w) throws java.io.IOException
Lexicon
public boolean isKnown(int word)
Lexicon
public boolean isKnown(java.lang.String word)
Lexicon
public java.util.Set<java.lang.String> tagSet(java.util.function.Function<java.lang.String,java.lang.String> basicCategoryFunction)
public UnknownWordModel getUnknownWordModel()
getUnknownWordModel
in interface Lexicon
public void setUnknownWordModel(UnknownWordModel uwm)
setUnknownWordModel
in interface Lexicon