[ner] Disambiguation word should be case-insensitive, closes #200147

author: Vincent Michel <vincent.michel@logilab.fr>
changeset: 9494bce3a7a9
branch: default
phase: public
hidden: no
parent revision: #f6b7eff50f7f [ner] Fix sparql results for types filtering after modifying in dataio
child revision: #33aaa7c273c1 [data] Avoid .py files for data
files modified by this revision
ner/filters.py
test/test_filters.py
# HG changeset patch
# User Vincent Michel <vincent.michel@logilab.fr>
# Date 1389200059 0
# Wed Jan 08 16:54:19 2014 +0000
# Node ID 9494bce3a7a9990970bd94d5220116207842a233
# Parent f6b7eff50f7f62c7a74ec95697b7d3c7ddc16172
[ner] Disambiguation word should be case-insensitive, closes #200147

diff --git a/ner/filters.py b/ner/filters.py
@@ -80,13 +80,13 @@
1                  for part in token.word.split(' '):
2                      parts[part.lower()] = uri
3          # Replace named entities
4          filtered_named_entities = []
5          for uri, peid, token in named_entities:
6 -            if token.word in parts:
7 +            if token.word.lower() in parts:
8                  # Change URI
9 -                uri = parts[token.word]
10 +                uri = parts[token.word.lower()]
11              filtered_named_entities.append((uri, peid, token))
12          return filtered_named_entities
13 
14 
15  class NerReplacementRulesFilter(object):
diff --git a/test/test_filters.py b/test/test_filters.py
@@ -79,10 +79,26 @@
16                                   sentence=Sentence(indice=0, start=0, end=16))),
17                            ('http://example.com/toto_tutu', None,
18                             Token(word='toto', start=21, end=25,
19                                   sentence=Sentence(indice=1, start=16, end=26)))])
20 
21 +    def test_disambiguation_word_case(self):
22 +        """ Test occurence filter """
23 +        text = 'Hello Toto Tutu. And Toto.'
24 +        source = NerSourceLexicon({'Toto Tutu': 'http://example.com/toto_tutu',
25 +                                   'Toto': 'http://example.com/toto'})
26 +        _filter = NerDisambiguationWordParts()
27 +        ner = NerProcess((source,), filters=(_filter,))
28 +        named_entities = ner.process_text(text)
29 +        self.assertEqual(named_entities,
30 +                         [('http://example.com/toto_tutu', None,
31 +                           Token(word='Toto Tutu', start=6, end=15,
32 +                                 sentence=Sentence(indice=0, start=0, end=16))),
33 +                          ('http://example.com/toto_tutu', None,
34 +                           Token(word='Toto', start=21, end=25,
35 +                                 sentence=Sentence(indice=1, start=16, end=26)))])
36 +
37      def test_rules_filter(self):
38          """ Test rules filter """
39          text = 'Hello toto tutu. And toto.'
40          source = NerSourceLexicon({'toto tutu': 'http://example.com/toto_tutu',
41                                     'toto': 'http://example.com/toto'})