In [None]:
# Fetch the 'ieer' corpus package; it is read later in this notebook through
# `nltk.corpus.ieer`. The call prints its progress and returns True on success.
import nltk
nltk.download('ieer')

[nltk_data] Downloading package ieer to /root/nltk_data...
[nltk_data]   Package ieer is already up-to-date!


True

In [None]:
# Fetch the 'averaged_perceptron_tagger_eng' resource.
# NOTE(review): presumably this is the model behind `nltk.pos_tag`, which the
# notebook uses as its default tagger further down -- confirm.
import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
# Fetch the 'treebank' package.
# NOTE(review): presumably this is what backs `nltk.corpus.treebank_chunk`,
# from which the notebook later takes a sample sentence -- confirm.
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [None]:
import nltk
from nltk.corpus import ieer

def ieertree2conlltags(tree, tagger=nltk.pos_tag):
    """
    Convert a chunked IEER tree into a list of (word, POS, IOB-tag) triplets.

    Every leaf is labelled from the subtree that directly contains it:

    * leaves whose containing subtree carries the root's label are tagged "O";
    * any other leaf is tagged "B-<label>" when it opens a chunk and
      "I-<label>" when it continues the chunk of the previous leaf.

    Chunk membership is decided by *which subtree* a leaf sits in, not by the
    label alone, so two back-to-back entities of the same type are kept as two
    chunks instead of being merged into one.

    Args:
        tree: an nltk Tree (it must offer ``label()``,
            ``treepositions('leaves')`` and indexing by tree position).
        tagger: callable that receives the sequence of words and returns
            (word, POS) pairs. Defaults to ``nltk.pos_tag``.

    Returns:
        A list of (word, pos, iob) tuples, one per leaf. A tree without
        leaves yields an empty list (the tagger is not called).
    """
    root_label = tree.label()
    words = []
    iob_tags = []
    # Tree position of the chunk the previous leaf belonged to; None after an
    # "O" token (or at the start) so the next entity token always opens a chunk.
    previous_parent = None

    for leaf_position in tree.treepositions("leaves"):
        parent_position = leaf_position[:-1]
        label = tree[parent_position].label()
        words.append(tree[leaf_position])

        if label == root_label:  # Not part of a named entity
            iob_tags.append("O")
            previous_parent = None
        elif parent_position == previous_parent:
            iob_tags.append("I-" + label)
        else:
            iob_tags.append("B-" + label)
            previous_parent = parent_position

    if not words:
        return []

    # Apply POS tagging to the whole leaf sequence in one call.
    tagged = tagger(tuple(words))
    return [(word, pos, iob) for (word, pos), iob in zip(tagged, iob_tags)]


In [None]:
from nltk.chunk.util import conlltags2tree

def ieer_chunked_sents(tagger=nltk.pos_tag):
    """
    Lazily produce one chunk tree per parsed IEER document.

    For every document returned by ``ieer.parsed_docs()`` the ``text`` tree is
    flattened to (word, POS, IOB) triplets with ``ieertree2conlltags`` (using
    ``tagger`` for the POS column) and rebuilt with ``conlltags2tree``.
    """
    def _as_chunk_tree(document):
        triplets = ieertree2conlltags(document.text, tagger)
        return conlltags2tree(triplets)

    yield from map(_as_chunk_tree, ieer.parsed_docs())


In [None]:
from nltk.chunk import ChunkParserI
from nltk.chunk.util import tree2conlltags, conlltags2tree
from nltk.classify import NaiveBayesClassifier

class ClassifierChunker(ChunkParserI):
    """
    Chunker that predicts one IOB tag per token with a Naive Bayes classifier.

    Tokens are classified left to right; the tags already assigned in the
    current sentence are handed to the feature extractor as ``history``.
    """

    def __init__(self, train_sents, feature_extractor=None):
        """
        Train the classifier.

        Args:
            train_sents: iterable of chunk trees; each is flattened with
                ``tree2conlltags`` into (word, pos, iob) triples.
            feature_extractor: callable ``(sent, i, history) -> dict`` where
                ``sent`` is a list of (word, pos) pairs, ``i`` the index of
                the token to describe and ``history`` the IOB tags of tokens
                ``0..i-1``. Defaults to ``_default_features``.
        """
        if feature_extractor is None:
            feature_extractor = self._default_features

        train_set = []
        for tree in train_sents:
            iob_seq = tree2conlltags(tree)
            # Give the extractor the same (word, pos) view that parse() uses,
            # so it can never read the gold chunk tag of the current or a
            # following token while training.
            tagged_sentence = [(word, pos) for word, pos, _ in iob_seq]
            history = []
            for i, (_, _, chunk_tag) in enumerate(iob_seq):
                features = feature_extractor(tagged_sentence, i, history)
                train_set.append((features, chunk_tag))
                history.append(chunk_tag)

        self.feature_extractor = feature_extractor
        self.classifier = NaiveBayesClassifier.train(train_set)

    def parse(self, tagged_sentence):
        """
        Chunk a POS-tagged sentence.

        Args:
            tagged_sentence: sequence of (word, pos) pairs.

        Returns:
            The tree built by ``conlltags2tree`` from the predicted
            (word, pos, iob) triples.
        """
        history = []
        iob_output = []

        for i, (word, pos) in enumerate(tagged_sentence):
            features = self.feature_extractor(tagged_sentence, i, history)
            tag = self.classifier.classify(features)
            iob_output.append((word, pos, tag))
            # Feed the prediction back so the next token can condition on it.
            history.append(tag)

        return conlltags2tree(iob_output)

    def _default_features(self, sent, i, history):
        """
        Describe token ``i`` of ``sent`` as a feature dict.

        Uses the token's word and POS, the POS of both neighbours and the
        previously assigned chunk tag; "<START>" / "<END>" stand in for
        neighbours beyond the sentence boundaries.
        """
        word, pos = sent[i][0], sent[i][1]
        prev_pos = sent[i - 1][1] if i > 0 else "<START>"
        prev_tag = history[i - 1] if i > 0 else "<START>"
        next_pos = sent[i + 1][1] if i < len(sent) - 1 else "<END>"

        return {
            "word": word,
            "pos": pos,
            "prev_pos": prev_pos,
            "next_pos": next_pos,
            "prev_tag": prev_tag,
            "pos+word": f"{pos}+{word}"
        }


In [None]:
from nltk.corpus import treebank_chunk

# Number of leading IEER documents used for training; the rest is held out.
N_TRAIN_DOCS = 80

# Load chunked examples from IEER (one chunk tree per parsed document)
ieer_chunks = list(ieer_chunked_sents())
print("Length of ieer_chunks :", len(ieer_chunks))

# Train-test split (positional, no shuffling)
train_data = ieer_chunks[:N_TRAIN_DOCS]
test_data = ieer_chunks[N_TRAIN_DOCS:]

# Train the classifier chunker
chunker = ClassifierChunker(train_data)

# Use a test sentence from Treebank (a different corpus than the training data)
sample_sentence = treebank_chunk.tagged_sents()[0]
parsed_output = chunker.parse(sample_sentence)

print("\nparsing :\n", parsed_output)


Length of ieer_chunks : 94

parsing :
 (S
  (LOCATION Pierre/NNP Vinken/NNP)
  ,/,
  (CARDINAL 61/CD)
  (DURATION years/NNS)
  (MEASURE old/JJ)
  ,/,
  will/MD
  join/VB
  the/DT
  board/NN
  as/IN
  a/DT
  (MEASURE nonexecutive/JJ)
  director/NN
  (DATE Nov./NNP 29/CD)
  ./.)


In [None]:


# Evaluation
# ChunkParserI.evaluate() is deprecated (it emits a warning pointing to
# accuracy(gold)); call accuracy() directly. The returned score object is then
# queried for accuracy, precision and recall on the held-out documents.
results = chunker.accuracy(test_data)
print("\nAccuracy :", results.accuracy())
print("Precision:", results.precision())
print("Recall   :", results.recall())


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  results = chunker.evaluate(test_data)



Accuracy : 0.859482206568475
Precision: 0.34391143911439115
Recall   : 0.5554231227651967


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True