| # Goal: convert spaCy's NER output into the following format: ((line_start, token_start), (line_end, token_end)) - ENT
|
|
|
| import spacy
|
# Sample passage to annotate. Kept as a plain string; the spaCy Doc is built
# from it in the script section at the bottom.
text = '\n"The world?" asked Gideon Spilett.\n\n"No, the island. Some stones for ballast, a mast, and a sail, which\nthe captain will make for us some day, and we shall go splendidly!\nWell, captain--and you, Mr. Spilett; and you, Herbert; and you,\n'

# Entity types to keep in the output; entities with any other label are
# dropped by build_spacy_map when this list is passed as ``labels``.
labels = ['PERSON', 'FAC', 'VEH', 'ORG', 'GPE', 'LOC']


def _whitespace_tokens(text):
    """Locate every whitespace-delimited token of *text*.

    Lines are obtained with ``text.split('\\n')`` and tokens with
    ``line.split()``, so the indices agree with a plain line/word split.

    Returns:
        A list of ``(line_no, token_no, start_char, end_char)`` tuples in
        document order. The character offsets index into *text* and
        ``end_char`` is exclusive.
    """
    tokens = []
    line_start = 0
    for line_no, line in enumerate(text.split('\n')):
        cursor = 0
        for token_no, word in enumerate(line.split()):
            # Only whitespace lies between the cursor and the next token,
            # so the first match from the cursor is that token itself.
            begin = line.index(word, cursor)
            cursor = begin + len(word)
            tokens.append(
                (line_no, token_no, line_start + begin, line_start + cursor)
            )
        # +1 accounts for the '\n' that split() removed.
        line_start += len(line) + 1
    return tokens


def build_spacy_map(text, entities, labels=None):
    """Convert character-offset entities to line/token coordinates.

    Args:
        text: The raw text the entity offsets refer to.
        entities: Iterable of ``(start_char, end_char, label)`` triples,
            with ``end_char`` exclusive.
        labels: Optional collection of labels to keep. ``None`` keeps all.

    Returns:
        A list of strings. An entity covering a single whitespace token is
        rendered as ``(line, token) - ENT``; an entity covering several
        tokens (possibly across lines) is rendered as
        ``((line_start, token_start), (line_end, token_end)) - ENT`` with
        an inclusive end token. The label ``PERSON`` is written as ``PER``.
        Entities that overlap no token are skipped.
    """
    tokens = _whitespace_tokens(text)
    wanted = None if labels is None else set(labels)
    spacy_map = []
    for start_char, end_char, label in entities:
        if wanted is not None and label not in wanted:
            continue
        # Match on character overlap rather than substring containment, so
        # short words like "a" or "and" cannot match unrelated entities.
        covered = [
            (line_no, token_no)
            for line_no, token_no, begin, end in tokens
            if begin < end_char and end > start_char
        ]
        if not covered:
            continue
        first, last = covered[0], covered[-1]
        tag = 'PER' if label == 'PERSON' else label
        if first == last:
            spacy_map.append(f'({first[0]}, {first[1]}) - {tag}')
        else:
            spacy_map.append(
                f'(({first[0]}, {first[1]}), ({last[0]}, {last[1]})) - {tag}'
            )
    return spacy_map


if __name__ == '__main__':
    nlp = spacy.load('en_core_web_sm')
    # Run the pipeline so that entities (doc.ents) actually exist.
    doc = nlp(text)
    ents = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    spacy_map = build_spacy_map(doc.text, ents, labels)
    print(spacy_map)
|