This paste expires on 2023-05-01 22:57:13.718349. Repaste, or download this paste. . Pasted through web.

# Goal: convert spaCy's NER output into the following format: ((line_start, token_start), (line_end, token_end)) - ENT
import spacy
# Load the small English pipeline; it provides the NER component used below.
nlp = spacy.load('en_core_web_sm')
doc = '\n"The world?" asked Gideon Spilett.\n\n"No, the island. Some stones for ballast, a mast, and a sail, which\nthe captain will make for us some day, and we shall go splendidly!\nWell, captain--and you, Mr. Spilett; and you, Herbert; and you,\n'
# Only entities carrying one of these labels are reported.
# NOTE(review): 'VEH' does not look like a label emitted by en_core_web_sm --
# confirm against the model's label scheme.
labels = ['PERSON', 'FAC', 'VEH', 'ORG', 'GPE', 'LOC']

# `doc` is the raw text; it has to be run through the pipeline before any
# entities exist (a plain str has no `.ents` attribute).
parsed_doc = nlp(doc)

# Each entry has the form '((line_start, token_start), (line_end, token_end)) - LABEL'.
#   * line_*  : 0-based line index, i.e. the number of '\n' characters that
#               precede the entity's first / last character in the raw text.
#   * token_* : document-level token indices (`ent.start` inclusive,
#               `ent.end` exclusive), not offsets within the line.
ann_format = []
for ent in parsed_doc.ents:
    if ent.label_ not in labels:
        continue
    # Character offsets of the span index directly into the raw text, so
    # counting newlines up to them yields the line the span starts/ends on.
    line_start = doc.count('\n', 0, ent.start_char)
    line_end = doc.count('\n', 0, ent.end_char)
    ann_format.append(
        f'(({line_start}, {ent.start}), ({line_end}, {ent.end})) - {ent.label_}'
    )
    print(f'{ent.text}, {ent.label_}')
Filename: None. Size: 698b. View raw, , hex, or download this file.
# expected output is a list of entities in the format specified above.
[
'((1, 7), (1, 9)) - PERSON',
'((5, 55), (5, 56)) - PERSON',
'((5, 60), (5, 61)) - PERSON'
]
Filename: None. Size: 171b. View raw, , hex, or download this file.