public class TrainOptions
extends java.lang.Object
implements java.io.Serializable
Modifier and Type | Class and Description |
---|---|
static class |
TrainOptions.TransformMatrixType |
Modifier and Type | Field and Description |
---|---|
boolean |
basicCategoryTagsInDependencyGrammar
Whether to use the basic or split tags in the dependency grammar
|
int |
batchSize |
boolean |
cheatPCFG
Add all test set trees to training data for PCFG.
|
boolean |
collinsPunc
Promote/delete punctuation like Collins.
|
int |
compactGrammar
How to compact grammars as FSMs.
|
int |
debugOutputFrequency
If larger than 0, the parser may choose to output debug information
every X seconds, X iterations, or some other similar metric
|
static int |
DEFAULT_BATCH_SIZE
When training using batches of trees, such as in the DVParser,
how many trees to use in one batch
|
static double |
DEFAULT_DELTA_MARGIN |
static int |
DEFAULT_K_BEST
When training the DV parsing method, how many of the top K trees
to analyze from the underlying parser
|
static double |
DEFAULT_LEARNING_RATE |
static int |
DEFAULT_QN_ITERATIONS_PER_BATCH
When training the DV parsing method, how many iterations to loop
for one batch of trees
|
static double |
DEFAULT_REGCOST
regularization constant
|
static double |
DEFAULT_SCALING_FOR_INIT |
static int |
DEFAULT_STALLED_ITERATION_LIMIT |
static int |
DEFAULT_TRAINING_ITERATIONS
When training a parsing method where the training has a (max)
number of iterations, how many iterations to loop
|
static java.lang.String |
DEFAULT_UNK_WORD |
java.util.Set<java.lang.String> |
deleteSplitters |
double |
deltaMargin
How much to penalize the wrong trees for how different they are
from the gold tree when training
|
int |
dvKBest |
boolean |
dvSimplifiedModel
Make the dv model as simple as possible
|
double |
fractionBeforeUnseenCounting
Start to aggregate signature-tag pairs only for words unseen in this
initial fraction of the data.
|
boolean |
gPA
This variable controls doing 2 levels of parent annotation.
|
int |
HSEL_CUT |
boolean |
hSelSplit |
double |
learningRate
How fast to learn (can mean different things for different algorithms)
|
boolean |
leftRec
Left edge is right-recursive (X << X) Bad.
|
boolean |
leftToRight |
boolean |
lowercaseWordVectors
Whether or not to lowercase word vectors
|
boolean |
markFinalStates
Whether or not to mark final states in binarized grammar.
|
boolean |
markovFactor
Whether to do "horizontal Markovization" (as in ACL 2003 paper).
|
int |
markovOrder |
boolean |
markStrahler
Horton-Strahler number/dimension (Maximilian Schlund)
|
int |
markUnary
Mark all unary nodes specially.
|
boolean |
markUnaryTags
Mark POS tags which are the sole member of their phrasal constituent.
|
int |
maxTrainTimeSeconds |
boolean |
noRebinarization
When binarizing trees, don't binarize trees with two children.
|
boolean |
noTagSplit |
int |
openClassTypesThreshold
A POS tag has to have been attributed to more than this number of word
types before it is regarded as an open-class tag.
|
boolean |
PA
This variable controls doing parent annotation of phrasal nodes.
|
boolean |
postGPA |
boolean |
postPA |
java.util.Set |
postSplitters |
boolean |
postSplitWithBaseCategory
Whether, in post-splitting of categories, nodes are annotated with the
(grand)parent's base category or with its complete subcategorized
category.
|
boolean |
predictSplits
Use the method reported by Berkeley for splitting and recombining
states.
|
TreeTransformer |
preTransformer
A transformer to use on the training data before any other
processing step.
|
java.io.PrintWriter |
printAnnotatedPW |
boolean |
printAnnotatedRuleCounts |
boolean |
printAnnotatedStateCounts |
java.io.PrintWriter |
printBinarizedPW |
boolean |
printStates |
int |
printTreeTransformations
Just for debugging: check that your tree transforms work correctly.
|
int |
qnEstimates
When training the DV parsing method, how many estimates to keep
for the qn approximation.
|
int |
qnIterationsPerBatch |
double |
qnTolerance
When training the DV parsing method, the tolerance to use if we
want to stop qn early
|
long |
randomSeed |
double |
regCost |
boolean |
rightRec
Right edge is right-recursive (X << X) Bad.
|
double |
ruleDiscount
Discounts the count of BinaryRules (only, apparently) in training data.
|
boolean |
ruleSmoothing
Enables linear rule smoothing during grammar extraction
but before grammar compaction.
|
double |
ruleSmoothingAlpha |
double |
scalingForInit
How much to scale certain parameters when initializing models.
|
boolean |
selectivePostSplit |
double |
selectivePostSplitCutOff |
boolean |
selectiveSplit
Only split the "common high KL divergence" parent categories....
|
double |
selectiveSplitCutOff |
boolean |
simpleBinarizedLabels
When binarizing trees, don't annotate the labels with anything
|
boolean |
sisterAnnotate
Selective Sister annotation.
|
java.util.Set<java.lang.String> |
sisterSplitters |
boolean |
smoothing
TODO wsg2011: This is the old grammar smoothing parameter that no
longer does anything in the parser.
|
int |
splitCount
If we are predicting splits, we loop this many times
|
boolean |
splitPrePreT
Mark all pre-preterminals (also does splitBaseNP: don't need both)
|
double |
splitRecombineRate
If we are predicting splits, we recombine states at this rate every loop
|
java.util.Set<java.lang.String> |
splitters
Set the splitter strings.
|
int |
stalledIterationLimit
How many iterations to allow training to stall before taking the
best model, if training in an iterative manner
|
java.lang.String |
taggedFiles
A set of files to use as extra information in the lexicon.
|
boolean |
tagPA
Parent annotation on tags.
|
boolean |
tagSelectivePostSplit |
double |
tagSelectivePostSplitCutOff |
boolean |
tagSelectiveSplit
Do parent annotation on tags selectively.
|
double |
tagSelectiveSplitCutOff |
int |
trainingIterations |
int |
trainingThreads
If the training algorithm allows for parallelization, how many
threads to use
|
int |
trainLengthLimit |
java.lang.String |
trainTreeFile |
boolean |
trainWordVectors
Do we want a model that uses word vectors (such as the DVParser)
to train those word vectors when training the model?
Note: models prior to 2014-02-13 may have incorrect values in this field, as it was originally a compile-time constant. |
TrainOptions.TransformMatrixType |
transformMatrixType |
boolean |
unknownCapsVector
Whether or not to build an unknown word vector for words with caps in them
|
boolean |
unknownChineseNumberVector
Whether or not to build an unknown word vector to match Chinese numbers
|
boolean |
unknownChinesePercentVector
Whether or not to build an unknown word vector to match Chinese percentages
|
boolean |
unknownChineseYearVector
Whether or not to build an unknown word vector to match Chinese years
|
boolean |
unknownDashedWordVectors
Whether or not to handle unknown dashed words by taking the last part
|
boolean |
unknownNumberVector
Whether or not to build an unknown word vector specifically for numbers
|
java.lang.String |
unkWord
Some models will use external data sources which contain
information about unknown words.
|
boolean |
useContextWords
Specifically for the DVModel, uses words on either side of a
context when combining constituents.
|
Constructor and Description |
---|
TrainOptions() |
Modifier and Type | Method and Description |
---|---|
int |
compactGrammar() |
void |
display() |
boolean |
outsideFactor()
If true, declare early -- leave this on except maybe with markov on.
|
static void |
printTrainTree(java.io.PrintWriter pw,
java.lang.String message,
Tree t) |
java.lang.String |
toString() |
public java.lang.String trainTreeFile
public int trainLengthLimit
public boolean cheatPCFG
public boolean markovFactor
public int markovOrder
public boolean hSelSplit
public int HSEL_CUT
public boolean markFinalStates
public int openClassTypesThreshold
public double fractionBeforeUnseenCounting
public boolean PA
public boolean gPA
public boolean postPA
public boolean postGPA
public boolean selectiveSplit
public double selectiveSplitCutOff
public boolean selectivePostSplit
public double selectivePostSplitCutOff
public boolean postSplitWithBaseCategory
public boolean sisterAnnotate
public java.util.Set<java.lang.String> sisterSplitters
public int markUnary
public boolean markUnaryTags
public boolean splitPrePreT
public boolean tagPA
public boolean tagSelectiveSplit
public double tagSelectiveSplitCutOff
public boolean tagSelectivePostSplit
public double tagSelectivePostSplitCutOff
public boolean rightRec
public boolean leftRec
public boolean collinsPunc
public java.util.Set<java.lang.String> splitters
public java.util.Set postSplitters
public java.util.Set<java.lang.String> deleteSplitters
public int printTreeTransformations
public java.io.PrintWriter printAnnotatedPW
public java.io.PrintWriter printBinarizedPW
public boolean printStates
public int compactGrammar
public boolean leftToRight
public boolean noTagSplit
public boolean ruleSmoothing
public double ruleSmoothingAlpha
public boolean smoothing
public double ruleDiscount
public boolean printAnnotatedRuleCounts
public boolean printAnnotatedStateCounts
public boolean basicCategoryTagsInDependencyGrammar
public TreeTransformer preTransformer
public java.lang.String taggedFiles
public boolean predictSplits
public int splitCount
public double splitRecombineRate
public boolean simpleBinarizedLabels
public boolean noRebinarization
public int trainingThreads
public static final int DEFAULT_K_BEST
public int dvKBest
public static final int DEFAULT_TRAINING_ITERATIONS
public int trainingIterations
public static final int DEFAULT_BATCH_SIZE
public int batchSize
public static final double DEFAULT_REGCOST
public double regCost
public static final int DEFAULT_QN_ITERATIONS_PER_BATCH
public int qnIterationsPerBatch
public int qnEstimates
public double qnTolerance
public int debugOutputFrequency
public long randomSeed
public static final double DEFAULT_LEARNING_RATE
public double learningRate
public static final double DEFAULT_DELTA_MARGIN
public double deltaMargin
public boolean unknownNumberVector
public boolean unknownDashedWordVectors
public boolean unknownCapsVector
public boolean dvSimplifiedModel
public boolean unknownChineseYearVector
public boolean unknownChineseNumberVector
public boolean unknownChinesePercentVector
public static final double DEFAULT_SCALING_FOR_INIT
public double scalingForInit
public int maxTrainTimeSeconds
public static final java.lang.String DEFAULT_UNK_WORD
public java.lang.String unkWord
public boolean lowercaseWordVectors
public TrainOptions.TransformMatrixType transformMatrixType
public boolean useContextWords
public boolean trainWordVectors
public static final int DEFAULT_STALLED_ITERATION_LIMIT
public int stalledIterationLimit
public boolean markStrahler
public boolean outsideFactor()
public int compactGrammar()
public void display()
public java.lang.String toString()
Overrides: toString in class java.lang.Object
public static void printTrainTree(java.io.PrintWriter pw, java.lang.String message, Tree t)