public class IOBUtils
extends java.lang.Object
Modifier and Type | Field and Description |
---|---|
static java.lang.String |
BeginSymbol |
static java.lang.String |
ContinuationSymbol |
static java.lang.String |
NosegSymbol |
static java.lang.String |
RewriteSymbol |
static java.lang.String |
RewriteTahSymbol
Deprecated. Use RewriteSymbol instead.
|
static java.lang.String |
RewriteTareefSymbol
Deprecated. Use RewriteSymbol instead.
|
Modifier and Type | Method and Description |
---|---|
static java.lang.String |
getBoundaryCharacter() |
static java.lang.String |
IOBToString(java.util.List<CoreLabel> labeledSequence)
Convert a list of labeled characters to a String.
|
static java.lang.String |
IOBToString(java.util.List<CoreLabel> labeledSequence,
java.lang.String segmentationMarker)
Convert a list of labeled characters to a String.
|
static java.lang.String |
IOBToString(java.util.List<CoreLabel> labeledSequence,
java.lang.String prefixMarker,
java.lang.String suffixMarker)
Convert a list of labeled characters to a String.
|
static java.lang.String |
IOBToString(java.util.List<CoreLabel> labeledSequence,
java.lang.String prefixMarker,
java.lang.String suffixMarker,
int startIndex,
int endIndex)
Convert a list of labeled characters to a String.
|
static void |
labelDomain(java.util.List<CoreLabel> tokenList,
java.lang.String domain) |
static java.util.List<CoreLabel> |
StringToIOB(java.util.List<CoreLabel> tokenList,
java.lang.Character segMarker,
boolean applyRewriteRules)
Convert a String to a list of characters suitable for labeling in an IOB
segmentation model.
|
static java.util.List<CoreLabel> |
StringToIOB(java.util.List<CoreLabel> tokenList,
java.lang.Character segMarker,
boolean applyRewriteRules,
boolean stripRewrites)
Convert a String to a list of characters suitable for labeling in an IOB
segmentation model.
|
static java.util.List<CoreLabel> |
StringToIOB(java.util.List<CoreLabel> tokenList,
java.lang.Character segMarker,
boolean applyRewriteRules,
boolean stripRewrites,
TokenizerFactory<CoreLabel> tf,
java.lang.String origText)
Convert a String to a list of characters suitable for labeling in an IOB
segmentation model.
|
static java.util.List<CoreLabel> |
StringToIOB(java.util.List<CoreLabel> tokenList,
java.lang.Character segMarker,
boolean applyRewriteRules,
TokenizerFactory<CoreLabel> tf,
java.lang.String origText)
Convert a String to a list of characters suitable for labeling in an IOB
segmentation model.
|
static java.util.List<CoreLabel> |
StringToIOB(java.lang.String string)
This version is for turning an unsegmented string into an IOB input, i.e.,
for processing raw text.
|
static java.util.List<CoreLabel> |
StringToIOB(java.lang.String str,
java.lang.Character segMarker) |
static java.util.List<IntPair> |
TokenSpansForIOB(java.util.List<CoreLabel> labeledSequence) |
public static final java.lang.String BeginSymbol
public static final java.lang.String ContinuationSymbol
public static final java.lang.String NosegSymbol
public static final java.lang.String RewriteSymbol
public static final java.lang.String RewriteTahSymbol
public static final java.lang.String RewriteTareefSymbol
public static java.lang.String getBoundaryCharacter()
public static java.util.List<CoreLabel> StringToIOB(java.util.List<CoreLabel> tokenList, java.lang.Character segMarker, boolean applyRewriteRules)
Parameters:
tokenList -
segMarker -
applyRewriteRules - add rewrite labels (for training data)
public static java.util.List<CoreLabel> StringToIOB(java.util.List<CoreLabel> tokenList, java.lang.Character segMarker, boolean applyRewriteRules, TokenizerFactory<CoreLabel> tf, java.lang.String origText)
Parameters:
tokenList -
segMarker -
applyRewriteRules - add rewrite labels (for training data)
tf - a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)
origText - the original string before tokenization (for determining original segment boundaries)
public static java.util.List<CoreLabel> StringToIOB(java.util.List<CoreLabel> tokenList, java.lang.Character segMarker, boolean applyRewriteRules, boolean stripRewrites)
Parameters:
tokenList -
segMarker -
applyRewriteRules - add rewrite labels (for training data)
stripRewrites - revert training data to old Green and DeNero model (remove rewrite labels but still rewrite to try to preserve raw text)
public static java.util.List<CoreLabel> StringToIOB(java.util.List<CoreLabel> tokenList, java.lang.Character segMarker, boolean applyRewriteRules, boolean stripRewrites, TokenizerFactory<CoreLabel> tf, java.lang.String origText)
Parameters:
tokenList -
segMarker -
applyRewriteRules - add rewrite labels (for training data)
stripRewrites - revert training data to old Green and DeNero model (remove rewrite labels but still rewrite to try to preserve raw text)
tf - a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)
origText - the original string before tokenization (for determining original segment boundaries)
public static java.util.List<CoreLabel> StringToIOB(java.lang.String string)
public static java.util.List<CoreLabel> StringToIOB(java.lang.String str, java.lang.Character segMarker)
public static java.lang.String IOBToString(java.util.List<CoreLabel> labeledSequence, java.lang.String prefixMarker, java.lang.String suffixMarker)
public static java.lang.String IOBToString(java.util.List<CoreLabel> labeledSequence, java.lang.String prefixMarker, java.lang.String suffixMarker, int startIndex, int endIndex)
public static java.lang.String IOBToString(java.util.List<CoreLabel> labeledSequence, java.lang.String segmentationMarker)
public static java.lang.String IOBToString(java.util.List<CoreLabel> labeledSequence)
public static void labelDomain(java.util.List<CoreLabel> tokenList, java.lang.String domain)