[named entities] Split core into preprocessors and filters modules, related to #187461

author Vincent Michel <vincent.michel@logilab.fr>
changeset 102c6331f3f6
branch default
phase public
hidden no
parent revision #77a3a4107f5c [named entities] Move tokenizer to utils and create a sources module for named entities, related to #187461
child revision #343a4304a259 [rename] Rename modules with shorter names, related to #187461
files modified by this revision
named_entities/__init__.py
named_entities/filters.py
named_entities/named_entities.py
named_entities/preprocessors.py
named_entities/sources.py
test/test_filter.py
test/test_filters.py
test/test_named_entities.py
test/test_preprocessor.py
test/test_preprocessors.py
# HG changeset patch
# User Vincent Michel <vincent.michel@logilab.fr>
# Date 1387464343 0
# Thu Dec 19 14:45:43 2013 +0000
# Node ID 102c6331f3f607c1aa25d9359644426e2e98aadc
# Parent 77a3a4107f5c930dca3228af0a0ec75ad224d538
[named entities] Split core into preprocessors and filters modules, related to #187461

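The split can be exercised end to end the way the updated tests do: NerProcess now lives in the package __init__, sources in named_entities/sources.py, and the preprocessors and filters in their new modules. A minimal usage sketch follows; the lexicon, text and variable names are illustrative only, not part of this changeset:

    # Illustrative sketch of the new module layout; values are examples, not repository code.
    from nazca.named_entities import NerProcess
    from nazca.named_entities.sources import NerSourceLexicon
    from nazca.named_entities.preprocessors import NerStopwordsFilterPreprocessor
    from nazca.named_entities.filters import NerOccurenceFilter

    # A lexicon source maps surface forms to URIs
    source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                               'me': 'http://example.com/me'})

    # Preprocessors drop or rewrite tokens before lookup; filters clean up the results
    process = NerProcess((source,),
                         preprocessors=(NerStopwordsFilterPreprocessor(),),
                         filters=(NerOccurenceFilter(min_occ=2),))

    # Returns a list of (uri, source_name, token) tuples
    named_entities = process.process_text('Hello everyone, this is   me speaking. And me.')
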
diff --git a/named_entities/__init__.py b/named_entities/__init__.py
@@ -0,0 +1,80 @@
1 +# -*- coding: utf-8 -*-
2 +""" Process/Core functions for Named Entities Recognition.
3 +"""
4 +from nazca.utils.tokenizer import RichStringTokenizer
5 +
6 +
7 +###############################################################################
8 +### NER PROCESS ###############################################################
9 +###############################################################################
10 +class NerProcess(object):
11 +    """ High-level process for Named Entities Recognition
12 +    """
13 +
14 +    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
15 +        """ Initialise the class.
16 +
17 +        :param ner_sources: an iterable of NER sources used for recognition
18 +        """
19 +        self.ner_sources = list(ner_sources)
20 +        self.preprocessors = preprocessors or []
21 +        self.filters = filters or []
22 +        self.unique = unique
23 +
24 +    def add_ner_source(self, process):
25 +        """ Add a NER source
26 +        """
27 +        self.ner_sources.append(process)
28 +
29 +    def add_preprocessors(self, preprocessor):
30 +        """ Add a preprocessor
31 +        """
32 +        self.preprocessors.append(preprocessor)
33 +
34 +    def add_filters(self, filter):
35 +        """ Add a filter
36 +        """
37 +        self.filters.append(filter)
38 +
39 +    def process_text(self, text):
40 +        """ High level function for analyzing a text
41 +        """
42 +        tokenizer = RichStringTokenizer(text)
43 +        return self.recognize_tokens(tokenizer)
44 +
45 +    def recognize_tokens(self, tokens):
46 +        """ Recognize Named Entities from a tokenizer or
47 +        an iterator yielding tokens.
48 +        """
49 +        last_stop = 0
50 +        named_entities = []
51 +        for token in tokens:
52 +            if token.start < last_stop:
53 +                continue # this token overlaps with a previous match
54 +            word = token.word
55 +            # Apply preprocessors
56 +            # XXX Preprocessors may be source dependent
57 +            for preprocessor in self.preprocessors:
58 +                token = preprocessor(token)
59 +                if not token:
60 +                    break
61 +            if not token:
62 +                continue
63 +            recognized = False
64 +            for process in self.ner_sources:
65 +                for uri in process.recognize_token(token):
66 +                    named_entities.append((uri, process.name, token))
67 +                    recognized = True
68 +                    last_stop = token.end
69 +                    if self.unique:
70 +                        break
71 +                if recognized and self.unique:
72 +                    break
73 +        # XXX Postprocessing/filters may be source dependent
74 +        return self.postprocess(named_entities)
75 +
76 +    def postprocess(self, named_entities):
77 +        """ Postprocess the results by applying filters """
78 +        for filter in self.filters:
79 +            named_entities = filter(named_entities)
80 +        return named_entities
diff --git a/named_entities/named_entities.py b/named_entities/filters.py
@@ -1,103 +1,23 @@
81  # -*- coding: utf-8 -*-
82 -""" Core functions for Named Entities Recognition.
83 +""" Filters for Named Entities Recognition.
84  """
85 -from nazca.utils.tokenizer import RichStringTokenizer, Token
86  from nazca.utils.dataio import sparqlquery
87 -from nazca.reference_data.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
88 -
89 -STOPWORDS = {'fr': FRENCH_STOPWORDS,
90 -             'en': ENGLISH_STOPWORDS}
91 -
92 -
93 -###############################################################################
94 -### NER PREPROCESSORS #########################################################
95 -###############################################################################
96 -class AbstractNerdyPreprocessor(object):
97 -    """ Preprocessor
98 -    """
99 -
100 -    def __call__(self, token):
101 -        raise NotImplementedError
102 -
103 -
104 -class NerdyWordSizeFilterPreprocessor(AbstractNerdyPreprocessor):
105 -    """ Remove token based on the size of the word
106 -    """
107 -    def __init__(self, min_size=None, max_size=None):
108 -        self.min_size = min_size
109 -        self.max_size = max_size
110 -
111 -    def __call__(self, token):
112 -        if ((self.min_size and len(token.word)<self.min_size)
113 -            or (self.max_size and len(token.word)>self.max_size)):
114 -            return None
115 -        return token
116 -
117 -
118 -class NerdyLowerCaseFilterPreprocessor(AbstractNerdyPreprocessor):
119 -    """ Remove token with word in lower case
120 -    """
121 -
122 -    def __call__(self, token):
123 -        return None if token.word.islower() else token
124 -
125 -
126 -class NerdyLowerFirstWordPreprocessor(AbstractNerdyPreprocessor):
127 -    """ Lower the first word of each sentence if it is a stopword.
128 -    """
129 -    def __init__(self, lang='en'):
130 -        self.lang = lang
131 -
132 -    def __call__(self, token):
133 -        if (token.start == token.sentence.start and
134 -            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
135 -            word = token.word[0].lower() + token.word[1:]
136 -            return Token(word, token.start, token.end, token.sentence)
137 -        return token
138 -
139 -
140 -class NerdyStopwordsFilterPreprocessor(AbstractNerdyPreprocessor):
141 -    """ Remove stopwords
142 -    """
143 -    def __init__(self, split_words=False, lang='en'):
144 -        self.split_words = split_words
145 -        self.lang = lang
146 -
147 -    def __call__(self, token):
148 -        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
149 -        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
150 -            return None
151 -        if not self.split_words and token.word.lower() in stopwords:
152 -            return None
153 -        return token
154 -
155 -
156 -class NerdyHashTagPreprocessor(AbstractNerdyPreprocessor):
157 -    """ Cleanup hashtag
158 -    """
159 -    def __call__(self, token):
160 -        if token.word.startswith('@'):
161 -            # XXX Split capitalize letter ?
162 -            # @BarackObama -> Barack Obama
163 -            word = token.word[1:].replace('_', ' ')
164 -            return Token(word, token.start, token.end, token.sentence)
165 -        return token
166 
167 
168  ###############################################################################
169  ### NER FILTERS ###############################################################
170  ###############################################################################
171 -class AbstractNerdyFilter(object):
172 +class AbstractNerFilter(object):
173      """ A filter used for cleaning named entities results
174      """
175 
176      def __call__(self, named_entities):
177          raise NotImplementedError
178 
179 
180 -class NerdyOccurenceFilter(object):
181 +class NerOccurenceFilter(object):
182      """ A filter based on the number of occurence of
183      named entities in the results.
184      """
185      def __init__(self, min_occ=None, max_occ=None):
186          self.min_occ = min_occ
@@ -108,15 +28,15 @@
187          counts = dict([(u, uris.count(u)) for u in set(uris)])
188          return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
189                                                or (self.max_occ and counts[n[0]]>self.max_occ))]
190 
191 
192 -class NerdyRDFTypeFilter(object):
193 +class NerRDFTypeFilter(object):
194      """ A filter based on the RDF type on entity
195      E.g.
196 
197 -    filter = NerdyRDFTypeFilter('http://dbpedia.org/sparql',
198 +    filter = NerRDFTypeFilter('http://dbpedia.org/sparql',
199                                  ('http://schema.org/Place',
200                                  'http://dbpedia.org/ontology/Agent',
201                                  'http://dbpedia.org/ontology/Place'))
202 
203      """
@@ -141,11 +61,11 @@
204                      seen_uris[uri] = True
205                      filtered_named_entities.append((uri, p, t))
206          return filtered_named_entities
207 
208 
209 -class NerdyDisambiguationWordParts(object):
210 +class NerDisambiguationWordParts(object):
211      """ Disambiguate named entities based on the words parts.
212      E.g.:
213            'toto tutu': 'http://example.com/toto_tutu',
214            'toto': 'http://example.com/toto'
215 
@@ -167,11 +87,11 @@
216                  uri = parts[token.word]
217              filtered_named_entities.append((uri, peid, token))
218          return filtered_named_entities
219 
220 
221 -class NerdyReplacementRulesFilter(object):
222 +class NerReplacementRulesFilter(object):
223      """ Allow to define replacement rules for Named Entities
224      """
225      def __init__(self,rules):
226          self.rules = rules
227 
@@ -179,90 +99,5 @@
228          filtered_named_entities = []
229          for uri, peid, token in named_entities:
230              uri = self.rules.get(uri, uri)
231              filtered_named_entities.append((uri, peid, token))
232          return filtered_named_entities
233 -
234 -
235 -###############################################################################
236 -### NER PROCESS ###############################################################
237 -###############################################################################
238 -class NerdyProcess(object):
239 -    """ High-level process for Named Entities Recognition
240 -    """
241 -
242 -    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
243 -        """ Initialise the class.
244 -
245 -        :tokenizer: an instance of tokenizer
246 -        """
247 -        self.ner_sources = list(ner_sources)
248 -        self.preprocessors = preprocessors or []
249 -        self.filters = filters or []
250 -        self.unique = unique
251 -
252 -    def add_ner_source(self, process):
253 -        """ Add a ner process
254 -        """
255 -        self.ner_sources.append(process)
256 -
257 -    def add_preprocessors(self, preprocessor):
258 -        """ Add a preprocessor
259 -        """
260 -        self.preprocessors.append(preprocessor)
261 -
262 -    def add_filters(self, filter):
263 -        """ Add a filter
264 -        """
265 -        self.filters.append(filter)
266 -
267 -    def process_text(self, text):
268 -        """ High level function for analyzing a text
269 -        """
270 -        tokenizer = RichStringTokenizer(text)
271 -        return self.recognize_tokens(tokenizer)
272 -
273 -    def recognize_tokens(self, tokens):
274 -        """ Recognize Named Entities from a tokenizer or
275 -        an iterator yielding tokens.
276 -        """
277 -        last_stop = 0
278 -        named_entities = []
279 -        for token in tokens:
280 -            if token.start < last_stop:
281 -                continue # this token overlaps with a previous match
282 -            word = token.word
283 -            # Applies preprocessors
284 -            # XXX Preprocessors may be sources dependant
285 -            for preprocessor in self.preprocessors:
286 -                token = preprocessor(token)
287 -                if not token:
288 -                    break
289 -            if not token:
290 -                continue
291 -            recognized = False
292 -            for process in self.ner_sources:
293 -                for uri in process.recognize_token(token):
294 -                    named_entities.append((uri, process.name, token))
295 -                    recognized = True
296 -                    last_stop = token.end
297 -                    if self.unique:
298 -                        break
299 -                if recognized and self.unique:
300 -                    break
301 -        # XXX Postprocess/filters may be sources dependant
302 -        return self.postprocess(named_entities)
303 -
304 -    def postprocess(self, named_entities):
305 -        """ Postprocess the results by applying filters """
306 -        for filter in self.filters:
307 -            named_entities = filter(named_entities)
308 -        return named_entities
309 -
310 -
311 -###############################################################################
312 -### NER RELATIONS PROCESS #####################################################
313 -###############################################################################
314 -class NerdyRelationsProcess(object):
315 -    """ Process for building simple relation from named entities results
316 -    """
317 -    pass
diff --git a/named_entities/named_entities.py b/named_entities/preprocessors.py
@@ -1,28 +1,27 @@
318  # -*- coding: utf-8 -*-
319 -""" Core functions for Named Entities Recognition.
320 +""" Preprocessors for Named Entities Recognition.
321  """
322 -from nazca.utils.tokenizer import RichStringTokenizer, Token
323 -from nazca.utils.dataio import sparqlquery
324 +from nazca.utils.tokenizer import Token
325  from nazca.reference_data.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
326 
327  STOPWORDS = {'fr': FRENCH_STOPWORDS,
328               'en': ENGLISH_STOPWORDS}
329 
330 
331  ###############################################################################
332  ### NER PREPROCESSORS #########################################################
333  ###############################################################################
334 -class AbstractNerdyPreprocessor(object):
335 +class AbstractNerPreprocessor(object):
336      """ Preprocessor
337      """
338 
339      def __call__(self, token):
340          raise NotImplementedError
341 
342 
343 -class NerdyWordSizeFilterPreprocessor(AbstractNerdyPreprocessor):
344 +class NerWordSizeFilterPreprocessor(AbstractNerPreprocessor):
345      """ Remove token based on the size of the word
346      """
347      def __init__(self, min_size=None, max_size=None):
348          self.min_size = min_size
349          self.max_size = max_size
@@ -32,19 +31,19 @@
350              or (self.max_size and len(token.word)>self.max_size)):
351              return None
352          return token
353 
354 
355 -class NerdyLowerCaseFilterPreprocessor(AbstractNerdyPreprocessor):
356 +class NerLowerCaseFilterPreprocessor(AbstractNerPreprocessor):
357      """ Remove token with word in lower case
358      """
359 
360      def __call__(self, token):
361          return None if token.word.islower() else token
362 
363 
364 -class NerdyLowerFirstWordPreprocessor(AbstractNerdyPreprocessor):
365 +class NerLowerFirstWordPreprocessor(AbstractNerPreprocessor):
366      """ Lower the first word of each sentence if it is a stopword.
367      """
368      def __init__(self, lang='en'):
369          self.lang = lang
370 
@@ -54,11 +53,11 @@
371              word = token.word[0].lower() + token.word[1:]
372              return Token(word, token.start, token.end, token.sentence)
373          return token
374 
375 
376 -class NerdyStopwordsFilterPreprocessor(AbstractNerdyPreprocessor):
377 +class NerStopwordsFilterPreprocessor(AbstractNerPreprocessor):
378      """ Remove stopwords
379      """
380      def __init__(self, split_words=False, lang='en'):
381          self.split_words = split_words
382          self.lang = lang
@@ -70,199 +69,15 @@
383          if not self.split_words and token.word.lower() in stopwords:
384              return None
385          return token
386 
387 
388 -class NerdyHashTagPreprocessor(AbstractNerdyPreprocessor):
389 +class NerHashTagPreprocessor(AbstractNerPreprocessor):
390      """ Cleanup hashtag
391      """
392      def __call__(self, token):
393          if token.word.startswith('@'):
394              # XXX Split capitalize letter ?
395              # @BarackObama -> Barack Obama
396              word = token.word[1:].replace('_', ' ')
397              return Token(word, token.start, token.end, token.sentence)
398          return token
399 -
400 -
401 -###############################################################################
402 -### NER FILTERS ###############################################################
403 -###############################################################################
404 -class AbstractNerdyFilter(object):
405 -    """ A filter used for cleaning named entities results
406 -    """
407 -
408 -    def __call__(self, named_entities):
409 -        raise NotImplementedError
410 -
411 -
412 -class NerdyOccurenceFilter(object):
413 -    """ A filter based on the number of occurence of
414 -    named entities in the results.
415 -    """
416 -    def __init__(self, min_occ=None, max_occ=None):
417 -        self.min_occ = min_occ
418 -        self.max_occ = max_occ
419 -
420 -    def __call__(self, named_entities):
421 -        uris = [u for u, p, t in named_entities]
422 -        counts = dict([(u, uris.count(u)) for u in set(uris)])
423 -        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
424 -                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
425 -
426 -
427 -class NerdyRDFTypeFilter(object):
428 -    """ A filter based on the RDF type on entity
429 -    E.g.
430 -
431 -    filter = NerdyRDFTypeFilter('http://dbpedia.org/sparql',
432 -                                ('http://schema.org/Place',
433 -                                'http://dbpedia.org/ontology/Agent',
434 -                                'http://dbpedia.org/ontology/Place'))
435 -
436 -    """
437 -    def __init__(self, endpoint, accepted_types):
438 -        self.endpoint = endpoint
439 -        self.accepted_types = accepted_types
440 -        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
441 -
442 -    def __call__(self, named_entities):
443 -        filtered_named_entities = []
444 -        seen_uris = {}
445 -        for uri, p, t in named_entities:
446 -            if uri in seen_uris:
447 -                if seen_uris[uri]:
448 -                    filtered_named_entities.append((uri, p, t))
449 -            else:
450 -                results = sparqlquery(self.endpoint, self.query % {'uri': uri})
451 -                types = set([r['type']['value'] for r in results])
452 -                if not len(types.intersection(self.accepted_types)):
453 -                    seen_uris[uri] = False
454 -                else:
455 -                    seen_uris[uri] = True
456 -                    filtered_named_entities.append((uri, p, t))
457 -        return filtered_named_entities
458 -
459 -
460 -class NerdyDisambiguationWordParts(object):
461 -    """ Disambiguate named entities based on the words parts.
462 -    E.g.:
463 -          'toto tutu': 'http://example.com/toto_tutu',
464 -          'toto': 'http://example.com/toto'
465 -
466 -          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
467 -          by 'http://example.com/toto_tutu'
468 -    """
469 -    def __call__(self, named_entities):
470 -        # Create parts dictionnary
471 -        parts = {}
472 -        for uri, peid, token in named_entities:
473 -            if ' ' in token.word:
474 -                for part in token.word.split(' '):
475 -                    parts[part.lower()] = uri
476 -        # Replace named entities
477 -        filtered_named_entities = []
478 -        for uri, peid, token in named_entities:
479 -            if token.word in parts:
480 -                # Change URI
481 -                uri = parts[token.word]
482 -            filtered_named_entities.append((uri, peid, token))
483 -        return filtered_named_entities
484 -
485 -
486 -class NerdyReplacementRulesFilter(object):
487 -    """ Allow to define replacement rules for Named Entities
488 -    """
489 -    def __init__(self,rules):
490 -        self.rules = rules
491 -
492 -    def __call__(self, named_entities):
493 -        filtered_named_entities = []
494 -        for uri, peid, token in named_entities:
495 -            uri = self.rules.get(uri, uri)
496 -            filtered_named_entities.append((uri, peid, token))
497 -        return filtered_named_entities
498 -
499 -
500 -###############################################################################
501 -### NER PROCESS ###############################################################
502 -###############################################################################
503 -class NerdyProcess(object):
504 -    """ High-level process for Named Entities Recognition
505 -    """
506 -
507 -    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
508 -        """ Initialise the class.
509 -
510 -        :tokenizer: an instance of tokenizer
511 -        """
512 -        self.ner_sources = list(ner_sources)
513 -        self.preprocessors = preprocessors or []
514 -        self.filters = filters or []
515 -        self.unique = unique
516 -
517 -    def add_ner_source(self, process):
518 -        """ Add a ner process
519 -        """
520 -        self.ner_sources.append(process)
521 -
522 -    def add_preprocessors(self, preprocessor):
523 -        """ Add a preprocessor
524 -        """
525 -        self.preprocessors.append(preprocessor)
526 -
527 -    def add_filters(self, filter):
528 -        """ Add a filter
529 -        """
530 -        self.filters.append(filter)
531 -
532 -    def process_text(self, text):
533 -        """ High level function for analyzing a text
534 -        """
535 -        tokenizer = RichStringTokenizer(text)
536 -        return self.recognize_tokens(tokenizer)
537 -
538 -    def recognize_tokens(self, tokens):
539 -        """ Recognize Named Entities from a tokenizer or
540 -        an iterator yielding tokens.
541 -        """
542 -        last_stop = 0
543 -        named_entities = []
544 -        for token in tokens:
545 -            if token.start < last_stop:
546 -                continue # this token overlaps with a previous match
547 -            word = token.word
548 -            # Applies preprocessors
549 -            # XXX Preprocessors may be sources dependant
550 -            for preprocessor in self.preprocessors:
551 -                token = preprocessor(token)
552 -                if not token:
553 -                    break
554 -            if not token:
555 -                continue
556 -            recognized = False
557 -            for process in self.ner_sources:
558 -                for uri in process.recognize_token(token):
559 -                    named_entities.append((uri, process.name, token))
560 -                    recognized = True
561 -                    last_stop = token.end
562 -                    if self.unique:
563 -                        break
564 -                if recognized and self.unique:
565 -                    break
566 -        # XXX Postprocess/filters may be sources dependant
567 -        return self.postprocess(named_entities)
568 -
569 -    def postprocess(self, named_entities):
570 -        """ Postprocess the results by applying filters """
571 -        for filter in self.filters:
572 -            named_entities = filter(named_entities)
573 -        return named_entities
574 -
575 -
576 -###############################################################################
577 -### NER RELATIONS PROCESS #####################################################
578 -###############################################################################
579 -class NerdyRelationsProcess(object):
580 -    """ Process for building simple relation from named entities results
581 -    """
582 -    pass
diff --git a/named_entities/sources.py b/named_entities/sources.py
@@ -1,7 +1,7 @@
583  # -*- coding: utf-8 -*-
584 -""" Core functions for Named Entities Recognition.
585 +""" Sources for Named Entities Recognition.
586  """
587  from nazca.utils.tokenizer import Token
588  from nazca.utils.dataio import sparqlquery, rqlquery
589 
590 
@@ -101,11 +101,11 @@
591 
592  class NerSourceSparql(AbstractNerSource):
593      """ High-level source for Named Entities Recognition
594      SPARQL version
595 
596 -   >>> from nerdy.core import NerSourceSparql
597 +   >>> from nazca.named_entities.sources import NerSourceSparql
598     >>> ner_source = NerSourceSparql('''SELECT ?uri
599                                           WHERE{
600                                           ?uri rdfs:label "%(word)s"@en}''',
601  			                 'http://dbpedia.org/sparql')
602     >>> print ner_source.recognize_token('Victor Hugo')
@@ -119,6 +119,6 @@
603      """
604 
605      def query_word(self, word):
606          """ Query a word for a Named Entities Recognition process
607          """
608 -        return [r['uri']['value'] for r in sparqlquery(self.endpoint, self.query % {'word': word})]
609 +        return [r[0] for r in sparqlquery(self.endpoint, self.query % {'word': word})]
diff --git a/test/test_filter.py b/test/test_filters.py
@@ -15,26 +15,27 @@
610  #
611  # You should have received a copy of the GNU Lesser General Public License along
612  # with this program. If not, see <http://www.gnu.org/licenses/>.
613  import unittest2
614 
615 -from nerdy import core
616 -from nerdy.tokenizer import Token, Sentence
617 +from nazca.named_entities import named_entities as core, filters
618 +from nazca.named_entities.sources import NerSourceLexicon
619 +from nazca.utils.tokenizer import Token, Sentence
620 
621 
622  class FilterTest(unittest2.TestCase):
623      """ Test of filters """
624 
625      def test_occurence_filter_min_occ(self):
626          """ Test occurence filter """
627          text = 'Hello everyone, this is   me speaking. And me.'
628 -        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
629 -                                          'me': 'http://example.com/me'})
630 -        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
631 -        _filter = core.NerdyOccurenceFilter(min_occ=2)
632 -        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
633 -        named_entities = nerdy.process_text(text)
634 +        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
635 +                                    'me': 'http://example.com/me'})
636 +        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
637 +        _filter = filters.NerOccurenceFilter(min_occ=2)
638 +        ner = core.NerProcess((source1, source2), filters=(_filter,))
639 +        named_entities = ner.process_text(text)
640          self.assertEqual(named_entities,
641                           [('http://example.com/me', None,
642                             Token(word='me', start=26, end=28,
643                                             sentence=Sentence(indice=0, start=0, end=38))),
644                            ('http://example2.com/me', None,
@@ -48,29 +49,29 @@
645                                             sentence=Sentence(indice=1, start=38, end=46)))])
646 
647      def test_occurence_filter_max_occ(self):
648          """ Test occurence filter """
649          text = 'Hello everyone, this is   me speaking. And me.'
650 -        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
651 -                                          'me': 'http://example.com/me'})
652 -        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
653 -        _filter = core.NerdyOccurenceFilter(max_occ=1)
654 -        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
655 -        named_entities = nerdy.process_text(text)
656 +        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
657 +                                    'me': 'http://example.com/me'})
658 +        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
659 +        _filter = filters.NerOccurenceFilter(max_occ=1)
660 +        ner = core.NerProcess((source1, source2), filters=(_filter,))
661 +        named_entities = ner.process_text(text)
662          self.assertEqual(named_entities,
663                           [('http://example.com/everyone', None,
664                             Token(word='everyone', start=6, end=14,
665                                             sentence=Sentence(indice=0, start=0, end=38))),])
666 
667      def test_disambiguation_word_length(self):
668          """ Test occurence filter """
669          text = 'Hello toto tutu. And toto.'
670 -        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
671 -                                          'toto': 'http://example.com/toto'})
672 -        _filter = core.NerdyDisambiguationWordParts()
673 -        nerdy = core.NerdyProcess((source,), filters=(_filter,))
674 -        named_entities = nerdy.process_text(text)
675 +        source = NerSourceLexicon({'toto tutu': 'http://example.com/toto_tutu',
676 +                                   'toto': 'http://example.com/toto'})
677 +        _filter = filters.NerDisambiguationWordParts()
678 +        ner = core.NerProcess((source,), filters=(_filter,))
679 +        named_entities = ner.process_text(text)
680          self.assertEqual(named_entities,
681                           [('http://example.com/toto_tutu', None,
682                             Token(word='toto tutu', start=6, end=15,
683                                   sentence=Sentence(indice=0, start=0, end=16))),
684                            ('http://example.com/toto_tutu', None,
@@ -78,16 +79,16 @@
685                                   sentence=Sentence(indice=1, start=16, end=26)))])
686 
687      def test_rules_filter(self):
688          """ Test rules filter """
689          text = 'Hello toto tutu. And toto.'
690 -        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
691 -                                          'toto': 'http://example.com/toto'})
692 +        source = NerSourceLexicon({'toto tutu': 'http://example.com/toto_tutu',
693 +                                   'toto': 'http://example.com/toto'})
694          rules = {'http://example.com/toto': 'http://example.com/tata'}
695 -        _filter = core.NerdyReplacementRulesFilter(rules)
696 -        nerdy = core.NerdyProcess((source,), filters=(_filter,))
697 -        named_entities = nerdy.process_text(text)
698 +        _filter = filters.NerReplacementRulesFilter(rules)
699 +        ner = core.NerProcess((source,), filters=(_filter,))
700 +        named_entities = ner.process_text(text)
701          self.assertEqual(named_entities,
702                           [('http://example.com/toto_tutu', None,
703                             Token(word='toto tutu', start=6, end=15,
704                                   sentence=Sentence(indice=0, start=0, end=16))),
705                            ('http://example.com/tata', None,
diff --git a/test/test_named_entities.py b/test/test_named_entities.py
@@ -18,16 +18,17 @@
706  import unittest2
707 
708  from nazca.named_entities.sources import (NerSourceLexicon,
709                                            NerSourceSparql,
710                                            NerSourceRql)
711 -from nazca.named_entities import named_entities as core
712 +from nazca.named_entities import NerProcess
713  from nazca.utils.tokenizer import Token, Sentence
714 +from nazca.named_entities.preprocessors import NerStopwordsFilterPreprocessor
715 
716 
717 -class CoreTest(unittest2.TestCase):
718 -    """ Test of core """
719 +class NerTest(unittest2.TestCase):
720 +    """ Test of Ner """
721 
722      def test_lexicon_source(self):
723          """ Test lexicon source """
724          lexicon = {'everyone': 'http://example.com/everyone',
725                     'me': 'http://example.com/me'}
@@ -49,25 +50,26 @@
726          self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
727 
728      def test_sparql_source(self):
729          """ Test sparql source """
730          source = NerSourceSparql(u'http://dbpedia.org/sparql',
731 -                                 u'''SELECT ?uri
732 +                                 u'''SELECT DISTINCT ?uri
733                                       WHERE{
734 -                                     ?uri rdfs:label "Python"@en .
735 +                                     ?uri rdfs:label "%(word)s"@en .
736                                       ?uri rdf:type ?type}''')
737 -        self.assertEqual(source.query_word('cubicweb'),
738 -                         [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
739 +        self.assertEqual(source.query_word('Python'),
740 +                         [u'http://dbpedia.org/resource/Python',
741 +                          u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
742                            u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
743 
744 -    def test_nerdy_process(self):
745 -        """ Test nerdy process """
746 +    def test_ner_process(self):
747 +        """ Test ner process """
748          text = 'Hello everyone, this is   me speaking. And me.'
749          source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
750                                     'me': 'http://example.com/me'})
751 -        nerdy = core.NerdyProcess((source,))
752 -        named_entities = nerdy.process_text(text)
753 +        ner = NerProcess((source,))
754 +        named_entities = ner.process_text(text)
755          self.assertEqual(named_entities,
756                           [('http://example.com/everyone', None,
757                             Token(word='everyone', start=6, end=14,
758                                             sentence=Sentence(indice=0, start=0, end=38))),
759                            ('http://example.com/me', None,
@@ -75,19 +77,19 @@
760                                             sentence=Sentence(indice=0, start=0, end=38))),
761                            ('http://example.com/me', None,
762                             Token(word='me', start=43, end=45,
763                                             sentence=Sentence(indice=1, start=38, end=46)))])
764 
765 -    def test_nerdy_process_multisources(self):
766 -        """ Test nerdy process """
767 +    def test_ner_process_multisources(self):
768 +        """ Test ner process """
769          text = 'Hello everyone, this is   me speaking. And me.'
770          source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
771                                      'me': 'http://example.com/me'})
772          source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
773          # Two sources, not unique
774 -        nerdy = core.NerdyProcess((source1, source2))
775 -        named_entities = nerdy.process_text(text)
776 +        ner = NerProcess((source1, source2))
777 +        named_entities = ner.process_text(text)
778          self.assertEqual(named_entities,
779                           [('http://example.com/everyone', None,
780                             Token(word='everyone', start=6, end=14,
781                                             sentence=Sentence(indice=0, start=0, end=38))),
782                            ('http://example.com/me', None,
@@ -101,12 +103,12 @@
783                                             sentence=Sentence(indice=1, start=38, end=46))),
784                            ('http://example2.com/me', None,
785                             Token(word='me', start=43, end=45,
786                                             sentence=Sentence(indice=1, start=38, end=46)))])
787          # Two sources, unique
788 -        nerdy = core.NerdyProcess((source1, source2), unique=True)
789 -        named_entities = nerdy.process_text(text)
790 +        ner = NerProcess((source1, source2), unique=True)
791 +        named_entities = ner.process_text(text)
792          self.assertEqual(named_entities,
793                           [('http://example.com/everyone', None,
794                             Token(word='everyone', start=6, end=14,
795                                             sentence=Sentence(indice=0, start=0, end=38))),
796                            ('http://example.com/me', None,
@@ -114,12 +116,12 @@
797                                             sentence=Sentence(indice=0, start=0, end=38))),
798                            ('http://example.com/me', None,
799                             Token(word='me', start=43, end=45,
800                                             sentence=Sentence(indice=1, start=38, end=46)))])
801          # Two sources inversed, unique
802 -        nerdy = core.NerdyProcess((source2, source1), unique=True)
803 -        named_entities = nerdy.process_text(text)
804 +        ner = NerProcess((source2, source1), unique=True)
805 +        named_entities = ner.process_text(text)
806          self.assertEqual(named_entities,
807                           [('http://example.com/everyone', None,
808                             Token(word='everyone', start=6, end=14,
809                                             sentence=Sentence(indice=0, start=0, end=38))),
810                            ('http://example2.com/me', None,
@@ -127,18 +129,18 @@
811                                             sentence=Sentence(indice=0, start=0, end=38))),
812                            ('http://example2.com/me', None,
813                             Token(word='me', start=43, end=45,
814                                             sentence=Sentence(indice=1, start=38, end=46)))])
815 
816 -    def test_nerdy_process_add_sources(self):
817 -        """ Test nerdy process """
818 +    def test_ner_process_add_sources(self):
819 +        """ Test ner process """
820          text = 'Hello everyone, this is   me speaking. And me.'
821          source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
822                                      'me': 'http://example.com/me'})
823          source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
824 -        nerdy = core.NerdyProcess((source1,))
825 -        named_entities = nerdy.process_text(text)
826 +        ner = NerProcess((source1,))
827 +        named_entities = ner.process_text(text)
828          self.assertEqual(named_entities,
829                           [('http://example.com/everyone', None,
830                             Token(word='everyone', start=6, end=14,
831                                             sentence=Sentence(indice=0, start=0, end=38))),
832                            ('http://example.com/me', None,
@@ -146,12 +148,12 @@
833                                             sentence=Sentence(indice=0, start=0, end=38))),
834                            ('http://example.com/me', None,
835                             Token(word='me', start=43, end=45,
836                                             sentence=Sentence(indice=1, start=38, end=46))),])
837          # Two sources, not unique
838 -        nerdy.add_ner_source(source2)
839 -        named_entities = nerdy.process_text(text)
840 +        ner.add_ner_source(source2)
841 +        named_entities = ner.process_text(text)
842          self.assertEqual(named_entities,
843                           [('http://example.com/everyone', None,
844                             Token(word='everyone', start=6, end=14,
845                                             sentence=Sentence(indice=0, start=0, end=38))),
846                            ('http://example.com/me', None,
@@ -165,55 +167,55 @@
847                                             sentence=Sentence(indice=1, start=38, end=46))),
848                            ('http://example2.com/me', None,
849                             Token(word='me', start=43, end=45,
850                                             sentence=Sentence(indice=1, start=38, end=46)))])
851 
852 -    def test_nerdy_process_preprocess(self):
853 -        """ Test nerdy process """
854 +    def test_ner_process_preprocess(self):
855 +        """ Test ner process """
856          text = 'Hello Toto, this is   me speaking. And me.'
857          source = NerSourceLexicon({'Toto': 'http://example.com/toto',
858                                     'me': 'http://example.com/me'})
859 -        preprocessor = core.NerdyStopwordsFilterPreprocessor()
860 -        nerdy = core.NerdyProcess((source,),
861 +        preprocessor = NerStopwordsFilterPreprocessor()
862 +        ner = NerProcess((source,),
863                                    preprocessors=(preprocessor,))
864 -        named_entities = nerdy.process_text(text)
865 +        named_entities = ner.process_text(text)
866          self.assertEqual(named_entities, [('http://example.com/toto', None,
867                                             Token(word='Toto', start=6, end=10,
868                                                   sentence=Sentence(indice=0, start=0, end=34)))])
869 
870 -    def test_nerdy_process_add_preprocess(self):
871 -        """ Test nerdy process """
872 +    def test_ner_process_add_preprocess(self):
873 +        """ Test ner process """
874          text = 'Hello Toto, this is   me speaking. And me.'
875          source = NerSourceLexicon({'Toto': 'http://example.com/toto',
876                                     'me': 'http://example.com/me'})
877 -        preprocessor = core.NerdyStopwordsFilterPreprocessor()
878 -        nerdy = core.NerdyProcess((source,),)
879 -        named_entities = nerdy.process_text(text)
880 +        preprocessor = NerStopwordsFilterPreprocessor()
881 +        ner = NerProcess((source,),)
882 +        named_entities = ner.process_text(text)
883          self.assertEqual(named_entities,
884                           [('http://example.com/toto', None,
885                             Token(word='Toto', start=6, end=10,
886                                   sentence=Sentence(indice=0, start=0, end=34))),
887                            ('http://example.com/me', None,
888                             Token(word='me', start=22, end=24,
889                                   sentence=Sentence(indice=0, start=0, end=34))),
890                            ('http://example.com/me', None,
891                             Token(word='me', start=39, end=41,
892                                   sentence=Sentence(indice=1, start=34, end=42)))])
893 -        nerdy.add_preprocessors(preprocessor)
894 -        named_entities = nerdy.process_text(text)
895 +        ner.add_preprocessors(preprocessor)
896 +        named_entities = ner.process_text(text)
897          self.assertEqual(named_entities, [('http://example.com/toto', None,
898                                             Token(word='Toto', start=6, end=10,
899                                                   sentence=Sentence(indice=0, start=0, end=34)))])
900 
901 -    def test_nerdy_process_chained_word(self):
902 -        """ Test nerdy process """
903 +    def test_ner_process_chained_word(self):
904 +        """ Test ner process """
905          text = 'Hello everyone me, this is   me speaking. And me.'
906          source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
907                                     'everyone me': 'http://example.com/everyone_me',
908                                     'me': 'http://example.com/me'})
909 -        nerdy = core.NerdyProcess((source,))
910 -        named_entities = nerdy.process_text(text)
911 +        ner = NerProcess((source,))
912 +        named_entities = ner.process_text(text)
913          self.assertEqual(named_entities,
914                           [('http://example.com/everyone_me', None,
915                             Token(word='everyone me', start=6, end=17,
916                                   sentence=Sentence(indice=0, start=0, end=41))),
917                            ('http://example.com/me', None,
diff --git a/test/test_preprocessor.py b/test/test_preprocessors.py
@@ -15,42 +15,43 @@
918  #
919  # You should have received a copy of the GNU Lesser General Public License along
920  # with this program. If not, see <http://www.gnu.org/licenses/>.
921  import unittest2
922 
923 -from nerdy import core, tokenizer
924 +from nazca.utils import tokenizer
925 +from nazca.named_entities import preprocessors
926 
927 
928  class PreprocessorTest(unittest2.TestCase):
929      """ Test of preprocessors """
930 
931      def test_lowercasefilter(self):
932 -        preprocessor = core.NerdyLowerCaseFilterPreprocessor()
933 +        preprocessor = preprocessors.NerLowerCaseFilterPreprocessor()
934          token = tokenizer.Token('toto', 0, 4, None)
935          self.assertEqual(preprocessor(token), None)
936          token = tokenizer.Token('toto Tata', 0, 4, None)
937          self.assertEqual(preprocessor(token), token)
938          token = tokenizer.Token('toto tata', 0, 4, None)
939          self.assertEqual(preprocessor(token), None)
940 
941      def test_wordsizefilter(self):
942 -        preprocessor = core.NerdyWordSizeFilterPreprocessor()
943 +        preprocessor = preprocessors.NerWordSizeFilterPreprocessor()
944          token = tokenizer.Token('toto', 0, 4, None)
945          self.assertEqual(preprocessor(token), token)
946 -        preprocessor = core.NerdyWordSizeFilterPreprocessor(min_size=3)
947 +        preprocessor = preprocessors.NerWordSizeFilterPreprocessor(min_size=3)
948          token = tokenizer.Token('toto', 0, 4, None)
949          self.assertEqual(preprocessor(token), token)
950          token = tokenizer.Token('to', 0, 4, None)
951          self.assertEqual(preprocessor(token), None)
952 -        preprocessor = core.NerdyWordSizeFilterPreprocessor(max_size=3)
953 +        preprocessor = preprocessors.NerWordSizeFilterPreprocessor(max_size=3)
954          token = tokenizer.Token('toto', 0, 4, None)
955          self.assertEqual(preprocessor(token), None)
956          token = tokenizer.Token('to', 0, 4, None)
957          self.assertEqual(preprocessor(token), token)
958 
959      def test_lowerfirstword(self):
960 -        preprocessor = core.NerdyLowerFirstWordPreprocessor()
961 +        preprocessor = preprocessors.NerLowerFirstWordPreprocessor()
962          sentence = tokenizer.Sentence(0, 0, 20)
963          # Start of the sentence
964          token1 = tokenizer.Token('Toto tata', 0, 4, sentence)
965          token2 = tokenizer.Token('Toto tata', 0, 4, sentence)
966          self.assertEqual(preprocessor(token1), token2)
@@ -64,26 +65,26 @@
967          token1 = tokenizer.Token('Us tata', 12, 16, sentence)
968          token2 = tokenizer.Token('Us tata', 12, 16, sentence)
969          self.assertEqual(preprocessor(token1), token2)
970 
971      def test_stopwordsfilter(self):
972 -        preprocessor = core.NerdyStopwordsFilterPreprocessor()
973 +        preprocessor = preprocessors.NerStopwordsFilterPreprocessor()
974          token = tokenizer.Token('Toto', 0, 4, None)
975          self.assertEqual(preprocessor(token), token)
976          token = tokenizer.Token('Us', 0, 4, None)
977          self.assertEqual(preprocessor(token), None)
978          token = tokenizer.Token('Us there', 0, 4, None)
979          self.assertEqual(preprocessor(token), token)
980          # Split words
981 -        preprocessor = core.NerdyStopwordsFilterPreprocessor(split_words=True)
982 +        preprocessor = preprocessors.NerStopwordsFilterPreprocessor(split_words=True)
983          token = tokenizer.Token('Us there', 0, 4, None)
984          self.assertEqual(preprocessor(token), None)
985          token = tokenizer.Token('Us there toto', 0, 4, None)
986          self.assertEqual(preprocessor(token), token)
987 
988      def test_hashtag(self):
989 -        preprocessor = core.NerdyHashTagPreprocessor()
990 +        preprocessor = preprocessors.NerHashTagPreprocessor()
991          token = tokenizer.Token('Toto', 0, 4, None)
992          self.assertEqual(preprocessor(token), token)
993          token1 = tokenizer.Token('@BarackObama', 0, 4, None)
994          token2 = tokenizer.Token('BarackObama', 0, 4, None)
995          self.assertEqual(preprocessor(token1), token2)
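
Custom filters plug into the same postprocess() hook as the built-in ones: any callable that takes and returns the list of (uri, source_name, token) tuples will do. A hypothetical sketch built on the new filters module, shown only as an illustration of the AbstractNerFilter contract and not part of this changeset:

    # Hypothetical user-defined filter; class name and prefix logic are examples only.
    from nazca.named_entities.filters import AbstractNerFilter

    class UriPrefixFilter(AbstractNerFilter):
        """ Keep only named entities whose URI starts with a given prefix. """
        def __init__(self, prefix):
            self.prefix = prefix

        def __call__(self, named_entities):
            # named_entities is a list of (uri, source_name, token) tuples
            return [(uri, name, token) for uri, name, token in named_entities
                    if uri.startswith(self.prefix)]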