Jython:Preprocessing
From Mallet
Preprocessing with Mallet pipes
Mallet pipes can add features to tokens. Here we will use Mallet's pipes to add prefix and suffix features to a file for POS tagging and save the result in a different file. Alternatively, by replacing the last part of the script with the end of the script from the training example, we could train and run a structured classifier with the new features instead of writing the features to a file.
This is actually really simple. As before we create a pipe and add the data to it:
# The feature pipeline, in order: the sentence-to-token-sequence step, the
# 1- and 2-character suffix features (S1-, S2-), the 1- and 2-character
# prefix features (P1-, P2-), and finally the conversion to feature vectors.
featurePipes = (
    SimpleTaggerSentence2TokenSequence(),
    TokenTextCharSuffix("S1-", 1),
    TokenTextCharSuffix("S2-", 2),
    TokenTextCharPrefix("P1-", 1),
    TokenTextCharPrefix("P2-", 2),
    TokenSequence2FeatureVectorSequence(),
)
p = List2Pipe(featurePipes, defaultLabel)

# Tell the pipe to expect labels on the input.
p.setTargetProcessing(1)

# Add the data from inFile to the pipe.
instList = LineGroupInstanceList(p, inFile)
Next, we open a file for writing:
# Open the output file for writing. If that fails, report the problem on
# stderr and stop with a non-zero exit status so the failure is visible to
# whatever launched the script (a bare sys.exit() would signal success).
try:
    out = open(outFile, "w")
except IOError:
    sys.stderr.write("There was an error writing to %s\n" % outFile)
    sys.exit(1)
Finally we print the instance list to the file and close it.
# Write one line per token: the names of its features followed by its label.
# An empty line is written after each instance.
alphabet = instList.getDataAlphabet()
try:
    for i in range(instList.size()):
        instance = instList.getInstance(i)
        tokens = instance.getData()
        labels = instance.target
        for j in range(tokens.size()):
            # Each index is looked up in the data alphabet to recover the
            # feature's name.
            for featureIndex in tokens.get(j).getIndices():
                out.write(alphabet.lookupObject(featureIndex))
                out.write(" ")
            out.write(labels.getLabelAtPosition(j).toString())
            out.write("\n")
        out.write("\n")
finally:
    # Close the file even if writing fails part-way through.
    out.close()
Putting it all together and adding some import statements, we get:
""" import statements. """
from mallet.crfs import *
from edu.umass.cs.mallet.base.fst import SimpleTaggerSentence2TokenSequence
from edu.umass.cs.mallet.base.pipe import TokenSequence2FeatureVectorSequence
from edu.umass.cs.mallet.base.pipe.tsf import *
import jarray
import sys
""" define some variables so we don't have to search to change them """
defaultLabel = "O"
inFile = "pipe-in.txt"
outFile = "pipe-out.txt"

# Create a pipe to read in the training data, add the default feature to that
# pipe's alphabet, and tell the pipe to expect labels on the input.
p = List2Pipe(
    (
        SimpleTaggerSentence2TokenSequence(),
        TokenTextCharSuffix("S1-", 1),
        TokenTextCharSuffix("S2-", 2),
        TokenTextCharPrefix("P1-", 1),
        TokenTextCharPrefix("P2-", 2),
        TokenSequence2FeatureVectorSequence(),
    ),
    defaultLabel,
)
p.setTargetProcessing(1)
instList = LineGroupInstanceList(p, inFile)

# Open the output file. On failure, report on stderr and exit with a non-zero
# status (a bare sys.exit() would signal success to the caller).
try:
    out = open(outFile, "w")
except IOError:
    sys.stderr.write("There was an error writing to %s\n" % outFile)
    sys.exit(1)

# Write one line per token: the names of its features followed by its label.
# An empty line is written after each instance.
alphabet = instList.getDataAlphabet()
try:
    for i in range(instList.size()):
        instance = instList.getInstance(i)
        tokens = instance.getData()
        labels = instance.target
        for j in range(tokens.size()):
            # Each index is looked up in the data alphabet to recover the
            # feature's name.
            for featureIndex in tokens.get(j).getIndices():
                out.write(alphabet.lookupObject(featureIndex))
                out.write(" ")
            out.write(labels.getLabelAtPosition(j).toString())
            out.write("\n")
        out.write("\n")
finally:
    # Close the file even if writing fails part-way through.
    out.close()
This will read a file containing the lines:
Confidence NN
in IN
the DT
pound NN
It will then produce a file containing the lines:
P2-Co P1-C S2-ce S1-e Confidence NN
P1-i S1-n in IN
S1-e P2-th P1-t S2-he the DT
P2-po P1-p S2-nd S1-d pound NN
P1 is a one-character prefix feature, P2 a two-character prefix feature, and so on.