This paste expires on 2023-05-02 01:09:43.430127. Repaste, or download this paste. Pasted through web.

# output is a list of entities in the format specified above.
1 3 ['Gideon', 'Spilett'] Gideon
1 4 ['Spilett'] Spilett.
5 4 ['Spilett'] Spilett;
5 7 ['Herbert'] Herbert;
['((1, 3), (1, 4)) - PER', '(1, 4) - PER', '(5, 4) - PER', '(5, 7) - PER']
Filename: None. Size: 251b. View raw, hex, or download this file.
# Goal: convert spaCy's NER output into the following format: ((line_start, token_start), (line_end, token_end)) - ENT
import spacy
nlp = spacy.load('en_core_web_sm')
# Raw passage to annotate. Line numbers and token numbers in the output are
# 0-based indices into this text split on '\n' and then on whitespace.
doc = '\n"The world?" asked Gideon Spilett.\n\n"No, the island. Some stones for ballast, a mast, and a sail, which\nthe captain will make for us some day, and we shall go splendidly!\nWell, captain--and you, Mr. Spilett; and you, Herbert; and you,\n'
# Entity labels that are kept; everything else spaCy reports is dropped.
# NOTE(review): 'VEH' does not look like a label the loaded spaCy model emits --
# confirm whether this list is meant as a filter or as a target tag set.
labels = ['PERSON', 'FAC', 'VEH', 'ORG', 'GPE', 'LOC']


def _token_spans(text):
    """Locate every whitespace-separated token of *text*.

    Returns a list of ``(line_no, token_no, start_char, end_char, token)``
    tuples in reading order, where the character offsets index into *text*
    itself (``end_char`` is exclusive).
    """
    spans = []
    line_offset = 0
    for line_no, line in enumerate(text.split('\n')):
        cursor = 0
        for token_no, token in enumerate(line.split()):
            # Search from the end of the previous token so repeated words
            # resolve to their own occurrence.
            start = line.index(token, cursor)
            cursor = start + len(token)
            spans.append((line_no, token_no, line_offset + start, line_offset + cursor, token))
        line_offset += len(line) + 1  # +1 for the '\n' consumed by split
    return spans


def _covered_tokens(token_spans, start_char, end_char):
    """Return the entries of *token_spans* overlapping ``[start_char, end_char)``."""
    return [span for span in token_spans if span[2] < end_char and span[3] > start_char]


# Run the pipeline on the raw text; entity character offsets refer to `doc`.
parsed = nlp(doc)
token_spans = _token_spans(doc)

spacy_map = []
for ent in parsed.ents:
    if ent.label_ not in labels:
        continue
    covered = _covered_tokens(token_spans, ent.start_char, ent.end_char)
    if not covered:
        # Entity consists only of whitespace as far as our tokenisation goes.
        continue
    first_line, first_tok, _, _, first_word = covered[0]
    last_line, last_tok = covered[-1][:2]
    # Debug trace: where the entity starts and which word it landed on.
    print(first_line, first_tok, ent.text.split(), first_word)
    tag = ent.label_.replace('PERSON', 'PER')
    if (first_line, first_tok) == (last_line, last_tok):
        # Single-token entities use the short "(line, token) - ENT" form.
        spacy_map.append(f'({first_line}, {first_tok}) - {tag}')
    else:
        spacy_map.append(f'(({first_line}, {first_tok}), ({last_line}, {last_tok})) - {tag}')
print(spacy_map)
Filename: None. Size: 1kb. View raw, , hex, or download this file.