[named entities] Split core into preprocessors and filters modules, related to #187461

author:          Vincent Michel <vincent.michel@logilab.fr>
changeset:       6a0b643b9e78
branch:          default
phase:           draft
hidden:          yes
parent revision: #cc142a884361 [named entities] Move tokenizer to utils and create a sources module for named entities, related to #187461
child revision:  #39ce88665867 [rename] Rename modules with shorter names, related to #187461
files modified by this revision
named_entities/__init__.py
named_entities/filters.py
named_entities/named_entities.py
named_entities/preprocessors.py
named_entities/sources.py
test/test_filter.py
test/test_filters.py
test/test_named_entities.py
test/test_preprocessor.py
test/test_preprocessors.py
# HG changeset patch
# User Vincent Michel <vincent.michel@logilab.fr>
# Date 1387464343 0
# Thu Dec 19 14:45:43 2013 +0000
# Node ID 6a0b643b9e78124b989162b20c3327dca0953a1e
# Parent cc142a884361048f68ddaf6a0c3ce76a6bd430ee
[named entities] Split core into preprocessors and filters modules, related to #187461
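In short, the monolithic core module is split so that sources, preprocessors and filters each live in their own module, with the high-level NerProcess kept in the package __init__. A minimal before/after import sketch, based on the test updates in this changeset (the old paths come from the removed nerdy-based tests):

# Before (monolithic nerdy core, as in the removed tests):
#   from nerdy import core
#   source = core.NerdySourceLexical({'me': 'http://example.com/me'})
#   process = core.NerdyProcess((source,))

# After this changeset (split modules under nazca.named_entities):
from nazca.named_entities import NerProcess
from nazca.named_entities.sources import NerSourceLexicon
from nazca.named_entities.preprocessors import NerStopwordsFilterPreprocessor
from nazca.named_entities.filters import NerOccurenceFilter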

diff --git a/named_entities/__init__.py b/named_entities/__init__.py
@@ -0,0 +1,80 @@
1 +# -*- coding: utf-8 -*-
2 +""" Process/Core functions for Named Entities Recognition.
3 +"""
4 +from nazca.utils.tokenizer import RichStringTokenizer
5 +
6 +
7 +###############################################################################
8 +### NER PROCESS ###############################################################
9 +###############################################################################
10 +class NerProcess(object):
11 +    """ High-level process for Named Entities Recognition
12 +    """
13 +
14 +    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
15 +        """ Initialise the class.
16 +
17 +        :ner_sources: an iterable of NER sources used for recognition
18 +        """
19 +        self.ner_sources = list(ner_sources)
20 +        self.preprocessors = preprocessors or []
21 +        self.filters = filters or []
22 +        self.unique = unique
23 +
24 +    def add_ner_source(self, process):
25 +        """ Add a NER source
26 +        """
27 +        self.ner_sources.append(process)
28 +
29 +    def add_preprocessors(self, preprocessor):
30 +        """ Add a preprocessor
31 +        """
32 +        self.preprocessors.append(preprocessor)
33 +
34 +    def add_filters(self, filter):
35 +        """ Add a filter
36 +        """
37 +        self.filters.append(filter)
38 +
39 +    def process_text(self, text):
40 +        """ High level function for analyzing a text
41 +        """
42 +        tokenizer = RichStringTokenizer(text)
43 +        return self.recognize_tokens(tokenizer)
44 +
45 +    def recognize_tokens(self, tokens):
46 +        """ Recognize Named Entities from a tokenizer or
47 +        an iterator yielding tokens.
48 +        """
49 +        last_stop = 0
50 +        named_entities = []
51 +        for token in tokens:
52 +            if token.start < last_stop:
53 +                continue # this token overlaps with a previous match
54 +            word = token.word
55 +            # Applies preprocessors
56 +            # XXX Preprocessors may be source dependent
57 +            for preprocessor in self.preprocessors:
58 +                token = preprocessor(token)
59 +                if not token:
60 +                    break
61 +            if not token:
62 +                continue
63 +            recognized = False
64 +            for process in self.ner_sources:
65 +                for uri in process.recognize_token(token):
66 +                    named_entities.append((uri, process.name, token))
67 +                    recognized = True
68 +                    last_stop = token.end
69 +                    if self.unique:
70 +                        break
71 +                if recognized and self.unique:
72 +                    break
73 +        # XXX Postprocessing/filters may be source dependent
74 +        return self.postprocess(named_entities)
75 +
76 +    def postprocess(self, named_entities):
77 +        """ Postprocess the results by applying filters """
78 +        for filter in self.filters:
79 +            named_entities = filter(named_entities)
80 +        return named_entities
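A minimal usage sketch of the NerProcess API defined above, mirroring the updated tests (the lexicon and URIs are illustrative):

from nazca.named_entities import NerProcess
from nazca.named_entities.sources import NerSourceLexicon

# One lexicon source, no preprocessor, no filter.
source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                           'me': 'http://example.com/me'})
ner = NerProcess((source,))
# Each result is a (uri, source name, Token) tuple.
for uri, source_name, token in ner.process_text('Hello everyone, this is me speaking.'):
    print uri, token.word, token.start, token.end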
diff --git a/named_entities/filters.py b/named_entities/filters.py
@@ -0,0 +1,103 @@
81 +# -*- coding: utf-8 -*-
82 +""" Filters for Named Entities Recognition.
83 +"""
84 +from nazca.utils.dataio import sparqlquery
85 +
86 +
87 +###############################################################################
88 +### NER FILTERS ###############################################################
89 +###############################################################################
90 +class AbstractNerFilter(object):
91 +    """ A filter used for cleaning named entities results
92 +    """
93 +
94 +    def __call__(self, named_entities):
95 +        raise NotImplementedError
96 +
97 +
98 +class NerOccurenceFilter(object):
99 +    """ A filter based on the number of occurrences of
100 +    named entities in the results.
101 +    """
102 +    def __init__(self, min_occ=None, max_occ=None):
103 +        self.min_occ = min_occ
104 +        self.max_occ = max_occ
105 +
106 +    def __call__(self, named_entities):
107 +        uris = [u for u, p, t in named_entities]
108 +        counts = dict([(u, uris.count(u)) for u in set(uris)])
109 +        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
110 +                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
111 +
112 +
113 +class NerRDFTypeFilter(object):
114 +    """ A filter based on the RDF type of the entity.
115 +    E.g.
116 +
117 +    filter = NerRDFTypeFilter('http://dbpedia.org/sparql',
118 +                                ('http://schema.org/Place',
119 +                                'http://dbpedia.org/ontology/Agent',
120 +                                'http://dbpedia.org/ontology/Place'))
121 +
122 +    """
123 +    def __init__(self, endpoint, accepted_types):
124 +        self.endpoint = endpoint
125 +        self.accepted_types = accepted_types
126 +        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
127 +
128 +    def __call__(self, named_entities):
129 +        filtered_named_entities = []
130 +        seen_uris = {}
131 +        for uri, p, t in named_entities:
132 +            if uri in seen_uris:
133 +                if seen_uris[uri]:
134 +                    filtered_named_entities.append((uri, p, t))
135 +            else:
136 +                results = sparqlquery(self.endpoint, self.query % {'uri': uri})
137 +                types = set([r['type']['value'] for r in results])
138 +                if not len(types.intersection(self.accepted_types)):
139 +                    seen_uris[uri] = False
140 +                else:
141 +                    seen_uris[uri] = True
142 +                    filtered_named_entities.append((uri, p, t))
143 +        return filtered_named_entities
144 +
145 +
146 +class NerDisambiguationWordParts(object):
147 +    """ Disambiguate named entities based on their word parts.
148 +    E.g.:
149 +          'toto tutu': 'http://example.com/toto_tutu',
150 +          'toto': 'http://example.com/toto'
151 +
152 +          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
153 +          by 'http://example.com/toto_tutu'
154 +    """
155 +    def __call__(self, named_entities):
156 +        # Create parts dictionary
157 +        parts = {}
158 +        for uri, peid, token in named_entities:
159 +            if ' ' in token.word:
160 +                for part in token.word.split(' '):
161 +                    parts[part.lower()] = uri
162 +        # Replace named entities
163 +        filtered_named_entities = []
164 +        for uri, peid, token in named_entities:
165 +            if token.word in parts:
166 +                # Change URI
167 +                uri = parts[token.word]
168 +            filtered_named_entities.append((uri, peid, token))
169 +        return filtered_named_entities
170 +
171 +
172 +class NerReplacementRulesFilter(object):
173 +    """ Apply replacement rules to Named Entity URIs
174 +    """
175 +    def __init__(self, rules):
176 +        self.rules = rules
177 +
178 +    def __call__(self, named_entities):
179 +        filtered_named_entities = []
180 +        for uri, peid, token in named_entities:
181 +            uri = self.rules.get(uri, uri)
182 +            filtered_named_entities.append((uri, peid, token))
183 +        return filtered_named_entities
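A short sketch of how a filter plugs into NerProcess, following the new test_filters.py (lexicon and text are illustrative):

from nazca.named_entities import NerProcess, filters
from nazca.named_entities.sources import NerSourceLexicon

source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                           'me': 'http://example.com/me'})
# Keep only entities whose URI occurs at least twice in the results.
occurrence_filter = filters.NerOccurenceFilter(min_occ=2)
ner = NerProcess((source,), filters=(occurrence_filter,))
named_entities = ner.process_text('Hello everyone, this is me speaking. And me.')
# 'everyone' is found once and dropped; both 'me' matches are kept.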
diff --git a/named_entities/named_entities.py b/named_entities/named_entities.py
@@ -1,268 +0,0 @@
184 -# -*- coding: utf-8 -*-
185 -""" Core functions for Named Entities Recognition.
186 -"""
187 -from nazca.utils.tokenizer import RichStringTokenizer, Token
188 -from nazca.utils.dataio import sparqlquery
189 -from nazca.reference_data.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
190 -
191 -STOPWORDS = {'fr': FRENCH_STOPWORDS,
192 -             'en': ENGLISH_STOPWORDS}
193 -
194 -
195 -###############################################################################
196 -### NER PREPROCESSORS #########################################################
197 -###############################################################################
198 -class AbstractNerdyPreprocessor(object):
199 -    """ Preprocessor
200 -    """
201 -
202 -    def __call__(self, token):
203 -        raise NotImplementedError
204 -
205 -
206 -class NerdyWordSizeFilterPreprocessor(AbstractNerdyPreprocessor):
207 -    """ Remove token based on the size of the word
208 -    """
209 -    def __init__(self, min_size=None, max_size=None):
210 -        self.min_size = min_size
211 -        self.max_size = max_size
212 -
213 -    def __call__(self, token):
214 -        if ((self.min_size and len(token.word)<self.min_size)
215 -            or (self.max_size and len(token.word)>self.max_size)):
216 -            return None
217 -        return token
218 -
219 -
220 -class NerdyLowerCaseFilterPreprocessor(AbstractNerdyPreprocessor):
221 -    """ Remove token with word in lower case
222 -    """
223 -
224 -    def __call__(self, token):
225 -        return None if token.word.islower() else token
226 -
227 -
228 -class NerdyLowerFirstWordPreprocessor(AbstractNerdyPreprocessor):
229 -    """ Lower the first word of each sentence if it is a stopword.
230 -    """
231 -    def __init__(self, lang='en'):
232 -        self.lang = lang
233 -
234 -    def __call__(self, token):
235 -        if (token.start == token.sentence.start and
236 -            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
237 -            word = token.word[0].lower() + token.word[1:]
238 -            return Token(word, token.start, token.end, token.sentence)
239 -        return token
240 -
241 -
242 -class NerdyStopwordsFilterPreprocessor(AbstractNerdyPreprocessor):
243 -    """ Remove stopwords
244 -    """
245 -    def __init__(self, split_words=False, lang='en'):
246 -        self.split_words = split_words
247 -        self.lang = lang
248 -
249 -    def __call__(self, token):
250 -        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
251 -        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
252 -            return None
253 -        if not self.split_words and token.word.lower() in stopwords:
254 -            return None
255 -        return token
256 -
257 -
258 -class NerdyHashTagPreprocessor(AbstractNerdyPreprocessor):
259 -    """ Cleanup hashtag
260 -    """
261 -    def __call__(self, token):
262 -        if token.word.startswith('@'):
263 -            # XXX Split capitalize letter ?
264 -            # @BarackObama -> Barack Obama
265 -            word = token.word[1:].replace('_', ' ')
266 -            return Token(word, token.start, token.end, token.sentence)
267 -        return token
268 -
269 -
270 -###############################################################################
271 -### NER FILTERS ###############################################################
272 -###############################################################################
273 -class AbstractNerdyFilter(object):
274 -    """ A filter used for cleaning named entities results
275 -    """
276 -
277 -    def __call__(self, named_entities):
278 -        raise NotImplementedError
279 -
280 -
281 -class NerdyOccurenceFilter(object):
282 -    """ A filter based on the number of occurence of
283 -    named entities in the results.
284 -    """
285 -    def __init__(self, min_occ=None, max_occ=None):
286 -        self.min_occ = min_occ
287 -        self.max_occ = max_occ
288 -
289 -    def __call__(self, named_entities):
290 -        uris = [u for u, p, t in named_entities]
291 -        counts = dict([(u, uris.count(u)) for u in set(uris)])
292 -        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
293 -                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
294 -
295 -
296 -class NerdyRDFTypeFilter(object):
297 -    """ A filter based on the RDF type on entity
298 -    E.g.
299 -
300 -    filter = NerdyRDFTypeFilter('http://dbpedia.org/sparql',
301 -                                ('http://schema.org/Place',
302 -                                'http://dbpedia.org/ontology/Agent',
303 -                                'http://dbpedia.org/ontology/Place'))
304 -
305 -    """
306 -    def __init__(self, endpoint, accepted_types):
307 -        self.endpoint = endpoint
308 -        self.accepted_types = accepted_types
309 -        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
310 -
311 -    def __call__(self, named_entities):
312 -        filtered_named_entities = []
313 -        seen_uris = {}
314 -        for uri, p, t in named_entities:
315 -            if uri in seen_uris:
316 -                if seen_uris[uri]:
317 -                    filtered_named_entities.append((uri, p, t))
318 -            else:
319 -                results = sparqlquery(self.endpoint, self.query % {'uri': uri})
320 -                types = set([r['type']['value'] for r in results])
321 -                if not len(types.intersection(self.accepted_types)):
322 -                    seen_uris[uri] = False
323 -                else:
324 -                    seen_uris[uri] = True
325 -                    filtered_named_entities.append((uri, p, t))
326 -        return filtered_named_entities
327 -
328 -
329 -class NerdyDisambiguationWordParts(object):
330 -    """ Disambiguate named entities based on the words parts.
331 -    E.g.:
332 -          'toto tutu': 'http://example.com/toto_tutu',
333 -          'toto': 'http://example.com/toto'
334 -
335 -          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
336 -          by 'http://example.com/toto_tutu'
337 -    """
338 -    def __call__(self, named_entities):
339 -        # Create parts dictionnary
340 -        parts = {}
341 -        for uri, peid, token in named_entities:
342 -            if ' ' in token.word:
343 -                for part in token.word.split(' '):
344 -                    parts[part.lower()] = uri
345 -        # Replace named entities
346 -        filtered_named_entities = []
347 -        for uri, peid, token in named_entities:
348 -            if token.word in parts:
349 -                # Change URI
350 -                uri = parts[token.word]
351 -            filtered_named_entities.append((uri, peid, token))
352 -        return filtered_named_entities
353 -
354 -
355 -class NerdyReplacementRulesFilter(object):
356 -    """ Allow to define replacement rules for Named Entities
357 -    """
358 -    def __init__(self,rules):
359 -        self.rules = rules
360 -
361 -    def __call__(self, named_entities):
362 -        filtered_named_entities = []
363 -        for uri, peid, token in named_entities:
364 -            uri = self.rules.get(uri, uri)
365 -            filtered_named_entities.append((uri, peid, token))
366 -        return filtered_named_entities
367 -
368 -
369 -###############################################################################
370 -### NER PROCESS ###############################################################
371 -###############################################################################
372 -class NerdyProcess(object):
373 -    """ High-level process for Named Entities Recognition
374 -    """
375 -
376 -    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
377 -        """ Initialise the class.
378 -
379 -        :tokenizer: an instance of tokenizer
380 -        """
381 -        self.ner_sources = list(ner_sources)
382 -        self.preprocessors = preprocessors or []
383 -        self.filters = filters or []
384 -        self.unique = unique
385 -
386 -    def add_ner_source(self, process):
387 -        """ Add a ner process
388 -        """
389 -        self.ner_sources.append(process)
390 -
391 -    def add_preprocessors(self, preprocessor):
392 -        """ Add a preprocessor
393 -        """
394 -        self.preprocessors.append(preprocessor)
395 -
396 -    def add_filters(self, filter):
397 -        """ Add a filter
398 -        """
399 -        self.filters.append(filter)
400 -
401 -    def process_text(self, text):
402 -        """ High level function for analyzing a text
403 -        """
404 -        tokenizer = RichStringTokenizer(text)
405 -        return self.recognize_tokens(tokenizer)
406 -
407 -    def recognize_tokens(self, tokens):
408 -        """ Recognize Named Entities from a tokenizer or
409 -        an iterator yielding tokens.
410 -        """
411 -        last_stop = 0
412 -        named_entities = []
413 -        for token in tokens:
414 -            if token.start < last_stop:
415 -                continue # this token overlaps with a previous match
416 -            word = token.word
417 -            # Applies preprocessors
418 -            # XXX Preprocessors may be sources dependant
419 -            for preprocessor in self.preprocessors:
420 -                token = preprocessor(token)
421 -                if not token:
422 -                    break
423 -            if not token:
424 -                continue
425 -            recognized = False
426 -            for process in self.ner_sources:
427 -                for uri in process.recognize_token(token):
428 -                    named_entities.append((uri, process.name, token))
429 -                    recognized = True
430 -                    last_stop = token.end
431 -                    if self.unique:
432 -                        break
433 -                if recognized and self.unique:
434 -                    break
435 -        # XXX Postprocess/filters may be sources dependant
436 -        return self.postprocess(named_entities)
437 -
438 -    def postprocess(self, named_entities):
439 -        """ Postprocess the results by applying filters """
440 -        for filter in self.filters:
441 -            named_entities = filter(named_entities)
442 -        return named_entities
443 -
444 -
445 -###############################################################################
446 -### NER RELATIONS PROCESS #####################################################
447 -###############################################################################
448 -class NerdyRelationsProcess(object):
449 -    """ Process for building simple relation from named entities results
450 -    """
451 -    pass
diff --git a/named_entities/preprocessors.py b/named_entities/preprocessors.py
@@ -0,0 +1,83 @@
452 +# -*- coding: utf-8 -*-
453 +""" Preprocessors for Named Entities Recognition.
454 +"""
455 +from nazca.utils.tokenizer import Token
456 +from nazca.reference_data.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
457 +
458 +STOPWORDS = {'fr': FRENCH_STOPWORDS,
459 +             'en': ENGLISH_STOPWORDS}
460 +
461 +
462 +###############################################################################
463 +### NER PREPROCESSORS #########################################################
464 +###############################################################################
465 +class AbstractNerPreprocessor(object):
466 +    """ Preprocessor
467 +    """
468 +
469 +    def __call__(self, token):
470 +        raise NotImplementedError
471 +
472 +
473 +class NerWordSizeFilterPreprocessor(AbstractNerPreprocessor):
474 +    """ Remove token based on the size of the word
475 +    """
476 +    def __init__(self, min_size=None, max_size=None):
477 +        self.min_size = min_size
478 +        self.max_size = max_size
479 +
480 +    def __call__(self, token):
481 +        if ((self.min_size and len(token.word)<self.min_size)
482 +            or (self.max_size and len(token.word)>self.max_size)):
483 +            return None
484 +        return token
485 +
486 +
487 +class NerLowerCaseFilterPreprocessor(AbstractNerPreprocessor):
488 +    """ Remove tokens whose word is entirely lower case
489 +    """
490 +
491 +    def __call__(self, token):
492 +        return None if token.word.islower() else token
493 +
494 +
495 +class NerLowerFirstWordPreprocessor(AbstractNerPreprocessor):
496 +    """ Lower the first word of each sentence if it is a stopword.
497 +    """
498 +    def __init__(self, lang='en'):
499 +        self.lang = lang
500 +
501 +    def __call__(self, token):
502 +        if (token.start == token.sentence.start and
503 +            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
504 +            word = token.word[0].lower() + token.word[1:]
505 +            return Token(word, token.start, token.end, token.sentence)
506 +        return token
507 +
508 +
509 +class NerStopwordsFilterPreprocessor(AbstractNerPreprocessor):
510 +    """ Remove stopwords
511 +    """
512 +    def __init__(self, split_words=False, lang='en'):
513 +        self.split_words = split_words
514 +        self.lang = lang
515 +
516 +    def __call__(self, token):
517 +        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
518 +        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
519 +            return None
520 +        if not self.split_words and token.word.lower() in stopwords:
521 +            return None
522 +        return token
523 +
524 +
525 +class NerHashTagPreprocessor(AbstractNerPreprocessor):
526 +    """ Clean up hashtag/mention tokens (e.g. @BarackObama)
527 +    """
528 +    def __call__(self, token):
529 +        if token.word.startswith('@'):
530 +            # XXX Split on capital letters?
531 +            # @BarackObama -> Barack Obama
532 +            word = token.word[1:].replace('_', ' ')
533 +            return Token(word, token.start, token.end, token.sentence)
534 +        return token
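Preprocessors are applied to each token before the sources are queried and may drop a token by returning None. A small sketch in the spirit of the new test_preprocessors.py (token offsets are illustrative):

from nazca.utils.tokenizer import Token
from nazca.named_entities.preprocessors import (NerLowerCaseFilterPreprocessor,
                                                NerStopwordsFilterPreprocessor)

lower_case = NerLowerCaseFilterPreprocessor()
stopwords = NerStopwordsFilterPreprocessor(lang='en')
# All-lowercase tokens are dropped, mixed-case tokens pass through unchanged.
print lower_case(Token('toto', 0, 4, None))        # None
print lower_case(Token('toto Tata', 0, 9, None))   # the token itself
# English stopwords are dropped as well.
print stopwords(Token('Us', 0, 2, None))           # None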
diff --git a/named_entities/sources.py b/named_entities/sources.py
@@ -1,7 +1,7 @@
535  # -*- coding: utf-8 -*-
536 -""" Core functions for Named Entities Recognition.
537 +""" Sources for Named Entities Recognition.
538  """
539  from nazca.utils.tokenizer import Token
540  from nazca.utils.dataio import sparqlquery, rqlquery
541 
542 
@@ -101,11 +101,11 @@
543 
544  class NerSourceSparql(AbstractNerSource):
545      """ High-level source for Named Entities Recognition
546      SPARQL version
547 
548 -   >>> from nerdy.core import NerSourceSparql
549 +   >>> from ner.core import NerSourceSparql
550     >>> ner_source = NerSourceSparql('''SELECT ?uri
551                                           WHERE{
552                                           ?uri rdfs:label "%(word)s"@en}''',
553  			                 'http://dbpedia.org/sparql')
554     >>> print ner_source.recognize_token('Victor Hugo')
@@ -119,6 +119,6 @@
555      """
556 
557      def query_word(self, word):
558          """ Query a word for a Named Entities Recognition process
559          """
560 -        return [r['uri']['value'] for r in sparqlquery(self.endpoint, self.query % {'word': word})]
561 +        return [r[0] for r in sparqlquery(self.endpoint, self.query % {'word': word})]
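The last hunk also changes query_word to index result rows positionally (r[0]) instead of by binding name, presumably following a change in what sparqlquery returns. A usage sketch with the argument order used in the updated test (endpoint first, then a query containing a %(word)s placeholder); the DBpedia endpoint is only illustrative:

from nazca.named_entities.sources import NerSourceSparql

source = NerSourceSparql(u'http://dbpedia.org/sparql',
                         u'''SELECT DISTINCT ?uri
                             WHERE{
                             ?uri rdfs:label "%(word)s"@en .
                             ?uri rdf:type ?type}''')
# query_word returns the first column of each result row.
print source.query_word('Victor Hugo')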
diff --git a/test/test_filter.py b/test/test_filter.py
@@ -1,99 +0,0 @@
562 -# -*- coding:utf-8 -*-
563 -#
564 -# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
565 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
566 -#
567 -# This program is free software: you can redistribute it and/or modify it under
568 -# the terms of the GNU Lesser General Public License as published by the Free
569 -# Software Foundation, either version 2.1 of the License, or (at your option)
570 -# any later version.
571 -#
572 -# This program is distributed in the hope that it will be useful, but WITHOUT
573 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
574 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
575 -# details.
576 -#
577 -# You should have received a copy of the GNU Lesser General Public License along
578 -# with this program. If not, see <http://www.gnu.org/licenses/>.
579 -import unittest2
580 -
581 -from nerdy import core
582 -from nerdy.tokenizer import Token, Sentence
583 -
584 -
585 -class FilterTest(unittest2.TestCase):
586 -    """ Test of filters """
587 -
588 -    def test_occurence_filter_min_occ(self):
589 -        """ Test occurence filter """
590 -        text = 'Hello everyone, this is   me speaking. And me.'
591 -        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
592 -                                          'me': 'http://example.com/me'})
593 -        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
594 -        _filter = core.NerdyOccurenceFilter(min_occ=2)
595 -        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
596 -        named_entities = nerdy.process_text(text)
597 -        self.assertEqual(named_entities,
598 -                         [('http://example.com/me', None,
599 -                           Token(word='me', start=26, end=28,
600 -                                           sentence=Sentence(indice=0, start=0, end=38))),
601 -                          ('http://example2.com/me', None,
602 -                           Token(word='me', start=26, end=28,
603 -                                           sentence=Sentence(indice=0, start=0, end=38))),
604 -                          ('http://example.com/me', None,
605 -                           Token(word='me', start=43, end=45,
606 -                                           sentence=Sentence(indice=1, start=38, end=46))),
607 -                          ('http://example2.com/me', None,
608 -                           Token(word='me', start=43, end=45,
609 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
610 -
611 -    def test_occurence_filter_max_occ(self):
612 -        """ Test occurence filter """
613 -        text = 'Hello everyone, this is   me speaking. And me.'
614 -        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
615 -                                          'me': 'http://example.com/me'})
616 -        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
617 -        _filter = core.NerdyOccurenceFilter(max_occ=1)
618 -        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
619 -        named_entities = nerdy.process_text(text)
620 -        self.assertEqual(named_entities,
621 -                         [('http://example.com/everyone', None,
622 -                           Token(word='everyone', start=6, end=14,
623 -                                           sentence=Sentence(indice=0, start=0, end=38))),])
624 -
625 -    def test_disambiguation_word_length(self):
626 -        """ Test occurence filter """
627 -        text = 'Hello toto tutu. And toto.'
628 -        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
629 -                                          'toto': 'http://example.com/toto'})
630 -        _filter = core.NerdyDisambiguationWordParts()
631 -        nerdy = core.NerdyProcess((source,), filters=(_filter,))
632 -        named_entities = nerdy.process_text(text)
633 -        self.assertEqual(named_entities,
634 -                         [('http://example.com/toto_tutu', None,
635 -                           Token(word='toto tutu', start=6, end=15,
636 -                                 sentence=Sentence(indice=0, start=0, end=16))),
637 -                          ('http://example.com/toto_tutu', None,
638 -                           Token(word='toto', start=21, end=25,
639 -                                 sentence=Sentence(indice=1, start=16, end=26)))])
640 -
641 -    def test_rules_filter(self):
642 -        """ Test rules filter """
643 -        text = 'Hello toto tutu. And toto.'
644 -        source = core.NerdySourceLexical({'toto tutu': 'http://example.com/toto_tutu',
645 -                                          'toto': 'http://example.com/toto'})
646 -        rules = {'http://example.com/toto': 'http://example.com/tata'}
647 -        _filter = core.NerdyReplacementRulesFilter(rules)
648 -        nerdy = core.NerdyProcess((source,), filters=(_filter,))
649 -        named_entities = nerdy.process_text(text)
650 -        self.assertEqual(named_entities,
651 -                         [('http://example.com/toto_tutu', None,
652 -                           Token(word='toto tutu', start=6, end=15,
653 -                                 sentence=Sentence(indice=0, start=0, end=16))),
654 -                          ('http://example.com/tata', None,
655 -                           Token(word='toto', start=21, end=25,
656 -                                 sentence=Sentence(indice=1, start=16, end=26)))])
657 -
658 -if __name__ == '__main__':
659 -    unittest2.main()
660 -
diff --git a/test/test_filters.py b/test/test_filters.py
@@ -0,0 +1,100 @@
661 +# -*- coding:utf-8 -*-
662 +#
663 +# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
664 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
665 +#
666 +# This program is free software: you can redistribute it and/or modify it under
667 +# the terms of the GNU Lesser General Public License as published by the Free
668 +# Software Foundation, either version 2.1 of the License, or (at your option)
669 +# any later version.
670 +#
671 +# This program is distributed in the hope that it will be useful, but WITHOUT
672 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
673 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
674 +# details.
675 +#
676 +# You should have received a copy of the GNU Lesser General Public License along
677 +# with this program. If not, see <http://www.gnu.org/licenses/>.
678 +import unittest2
679 +
680 +from nazca.named_entities import named_entities as core, filters
681 +from nazca.named_entities.sources import NerSourceLexicon
682 +from nazca.utils.tokenizer import Token, Sentence
683 +
684 +
685 +class FilterTest(unittest2.TestCase):
686 +    """ Test of filters """
687 +
688 +    def test_occurence_filter_min_occ(self):
689 +        """ Test occurrence filter """
690 +        text = 'Hello everyone, this is   me speaking. And me.'
691 +        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
692 +                                    'me': 'http://example.com/me'})
693 +        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
694 +        _filter = filters.NerOccurenceFilter(min_occ=2)
695 +        ner = core.NerProcess((source1, source2), filters=(_filter,))
696 +        named_entities = ner.process_text(text)
697 +        self.assertEqual(named_entities,
698 +                         [('http://example.com/me', None,
699 +                           Token(word='me', start=26, end=28,
700 +                                           sentence=Sentence(indice=0, start=0, end=38))),
701 +                          ('http://example2.com/me', None,
702 +                           Token(word='me', start=26, end=28,
703 +                                           sentence=Sentence(indice=0, start=0, end=38))),
704 +                          ('http://example.com/me', None,
705 +                           Token(word='me', start=43, end=45,
706 +                                           sentence=Sentence(indice=1, start=38, end=46))),
707 +                          ('http://example2.com/me', None,
708 +                           Token(word='me', start=43, end=45,
709 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
710 +
711 +    def test_occurence_filter_max_occ(self):
712 +        """ Test occurrence filter """
713 +        text = 'Hello everyone, this is   me speaking. And me.'
714 +        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
715 +                                    'me': 'http://example.com/me'})
716 +        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
717 +        _filter = filters.NerOccurenceFilter(max_occ=1)
718 +        ner = core.NerProcess((source1, source2), filters=(_filter,))
719 +        named_entities = ner.process_text(text)
720 +        self.assertEqual(named_entities,
721 +                         [('http://example.com/everyone', None,
722 +                           Token(word='everyone', start=6, end=14,
723 +                                           sentence=Sentence(indice=0, start=0, end=38))),])
724 +
725 +    def test_disambiguation_word_length(self):
726 +        """ Test word parts disambiguation """
727 +        text = 'Hello toto tutu. And toto.'
728 +        source = NerSourceLexicon({'toto tutu': 'http://example.com/toto_tutu',
729 +                                   'toto': 'http://example.com/toto'})
730 +        _filter = filters.NerDisambiguationWordParts()
731 +        ner = core.NerProcess((source,), filters=(_filter,))
732 +        named_entities = ner.process_text(text)
733 +        self.assertEqual(named_entities,
734 +                         [('http://example.com/toto_tutu', None,
735 +                           Token(word='toto tutu', start=6, end=15,
736 +                                 sentence=Sentence(indice=0, start=0, end=16))),
737 +                          ('http://example.com/toto_tutu', None,
738 +                           Token(word='toto', start=21, end=25,
739 +                                 sentence=Sentence(indice=1, start=16, end=26)))])
740 +
741 +    def test_rules_filter(self):
742 +        """ Test rules filter """
743 +        text = 'Hello toto tutu. And toto.'
744 +        source = NerSourceLexicon({'toto tutu': 'http://example.com/toto_tutu',
745 +                                   'toto': 'http://example.com/toto'})
746 +        rules = {'http://example.com/toto': 'http://example.com/tata'}
747 +        _filter = filters.NerReplacementRulesFilter(rules)
748 +        ner = core.NerProcess((source,), filters=(_filter,))
749 +        named_entities = ner.process_text(text)
750 +        self.assertEqual(named_entities,
751 +                         [('http://example.com/toto_tutu', None,
752 +                           Token(word='toto tutu', start=6, end=15,
753 +                                 sentence=Sentence(indice=0, start=0, end=16))),
754 +                          ('http://example.com/tata', None,
755 +                           Token(word='toto', start=21, end=25,
756 +                                 sentence=Sentence(indice=1, start=16, end=26)))])
757 +
758 +if __name__ == '__main__':
759 +    unittest2.main()
760 +
diff --git a/test/test_named_entities.py b/test/test_named_entities.py
@@ -18,16 +18,17 @@
761  import unittest2
762 
763  from nazca.named_entities.sources import (NerSourceLexicon,
764                                            NerSourceSparql,
765                                            NerSourceRql)
766 -from nazca.named_entities import named_entities as core
767 +from nazca.named_entities import NerProcess
768  from nazca.utils.tokenizer import Token, Sentence
769 +from nazca.named_entities.preprocessors import NerStopwordsFilterPreprocessor
770 
771 
772 -class CoreTest(unittest2.TestCase):
773 -    """ Test of core """
774 +class NerTest(unittest2.TestCase):
775 +    """ Test of Ner """
776 
777      def test_lexicon_source(self):
778          """ Test lexicon source """
779          lexicon = {'everyone': 'http://example.com/everyone',
780                     'me': 'http://example.com/me'}
@@ -49,25 +50,26 @@
781          self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
782 
783      def test_sparql_source(self):
784          """ Test sparql source """
785          source = NerSourceSparql(u'http://dbpedia.org/sparql',
786 -                                 u'''SELECT ?uri
787 +                                 u'''SELECT DISTINCT ?uri
788                                       WHERE{
789 -                                     ?uri rdfs:label "Python"@en .
790 +                                     ?uri rdfs:label "%(word)s"@en .
791                                       ?uri rdf:type ?type}''')
792 -        self.assertEqual(source.query_word('cubicweb'),
793 -                         [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
794 +        self.assertEqual(source.query_word('Python'),
795 +                         [u'http://dbpedia.org/resource/Python',
796 +                          u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
797                            u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
798 
799 -    def test_nerdy_process(self):
800 -        """ Test nerdy process """
801 +    def test_ner_process(self):
802 +        """ Test ner process """
803          text = 'Hello everyone, this is   me speaking. And me.'
804          source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
805                                     'me': 'http://example.com/me'})
806 -        nerdy = core.NerdyProcess((source,))
807 -        named_entities = nerdy.process_text(text)
808 +        ner = NerProcess((source,))
809 +        named_entities = ner.process_text(text)
810          self.assertEqual(named_entities,
811                           [('http://example.com/everyone', None,
812                             Token(word='everyone', start=6, end=14,
813                                             sentence=Sentence(indice=0, start=0, end=38))),
814                            ('http://example.com/me', None,
@@ -75,19 +77,19 @@
815                                             sentence=Sentence(indice=0, start=0, end=38))),
816                            ('http://example.com/me', None,
817                             Token(word='me', start=43, end=45,
818                                             sentence=Sentence(indice=1, start=38, end=46)))])
819 
820 -    def test_nerdy_process_multisources(self):
821 -        """ Test nerdy process """
822 +    def test_ner_process_multisources(self):
823 +        """ Test ner process """
824          text = 'Hello everyone, this is   me speaking. And me.'
825          source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
826                                      'me': 'http://example.com/me'})
827          source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
828          # Two sources, not unique
829 -        nerdy = core.NerdyProcess((source1, source2))
830 -        named_entities = nerdy.process_text(text)
831 +        ner = NerProcess((source1, source2))
832 +        named_entities = ner.process_text(text)
833          self.assertEqual(named_entities,
834                           [('http://example.com/everyone', None,
835                             Token(word='everyone', start=6, end=14,
836                                             sentence=Sentence(indice=0, start=0, end=38))),
837                            ('http://example.com/me', None,
@@ -101,12 +103,12 @@
838                                             sentence=Sentence(indice=1, start=38, end=46))),
839                            ('http://example2.com/me', None,
840                             Token(word='me', start=43, end=45,
841                                             sentence=Sentence(indice=1, start=38, end=46)))])
842          # Two sources, unique
843 -        nerdy = core.NerdyProcess((source1, source2), unique=True)
844 -        named_entities = nerdy.process_text(text)
845 +        ner = NerProcess((source1, source2), unique=True)
846 +        named_entities = ner.process_text(text)
847          self.assertEqual(named_entities,
848                           [('http://example.com/everyone', None,
849                             Token(word='everyone', start=6, end=14,
850                                             sentence=Sentence(indice=0, start=0, end=38))),
851                            ('http://example.com/me', None,
@@ -114,12 +116,12 @@
852                                             sentence=Sentence(indice=0, start=0, end=38))),
853                            ('http://example.com/me', None,
854                             Token(word='me', start=43, end=45,
855                                             sentence=Sentence(indice=1, start=38, end=46)))])
856          # Two sources inversed, unique
857 -        nerdy = core.NerdyProcess((source2, source1), unique=True)
858 -        named_entities = nerdy.process_text(text)
859 +        ner = NerProcess((source2, source1), unique=True)
860 +        named_entities = ner.process_text(text)
861          self.assertEqual(named_entities,
862                           [('http://example.com/everyone', None,
863                             Token(word='everyone', start=6, end=14,
864                                             sentence=Sentence(indice=0, start=0, end=38))),
865                            ('http://example2.com/me', None,
@@ -127,18 +129,18 @@
866                                             sentence=Sentence(indice=0, start=0, end=38))),
867                            ('http://example2.com/me', None,
868                             Token(word='me', start=43, end=45,
869                                             sentence=Sentence(indice=1, start=38, end=46)))])
870 
871 -    def test_nerdy_process_add_sources(self):
872 -        """ Test nerdy process """
873 +    def test_ner_process_add_sources(self):
874 +        """ Test ner process """
875          text = 'Hello everyone, this is   me speaking. And me.'
876          source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
877                                      'me': 'http://example.com/me'})
878          source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
879 -        nerdy = core.NerdyProcess((source1,))
880 -        named_entities = nerdy.process_text(text)
881 +        ner = NerProcess((source1,))
882 +        named_entities = ner.process_text(text)
883          self.assertEqual(named_entities,
884                           [('http://example.com/everyone', None,
885                             Token(word='everyone', start=6, end=14,
886                                             sentence=Sentence(indice=0, start=0, end=38))),
887                            ('http://example.com/me', None,
@@ -146,12 +148,12 @@
888                                             sentence=Sentence(indice=0, start=0, end=38))),
889                            ('http://example.com/me', None,
890                             Token(word='me', start=43, end=45,
891                                             sentence=Sentence(indice=1, start=38, end=46))),])
892          # Two sources, not unique
893 -        nerdy.add_ner_source(source2)
894 -        named_entities = nerdy.process_text(text)
895 +        ner.add_ner_source(source2)
896 +        named_entities = ner.process_text(text)
897          self.assertEqual(named_entities,
898                           [('http://example.com/everyone', None,
899                             Token(word='everyone', start=6, end=14,
900                                             sentence=Sentence(indice=0, start=0, end=38))),
901                            ('http://example.com/me', None,
@@ -165,55 +167,55 @@
902                                             sentence=Sentence(indice=1, start=38, end=46))),
903                            ('http://example2.com/me', None,
904                             Token(word='me', start=43, end=45,
905                                             sentence=Sentence(indice=1, start=38, end=46)))])
906 
907 -    def test_nerdy_process_preprocess(self):
908 -        """ Test nerdy process """
909 +    def test_ner_process_preprocess(self):
910 +        """ Test ner process """
911          text = 'Hello Toto, this is   me speaking. And me.'
912          source = NerSourceLexicon({'Toto': 'http://example.com/toto',
913                                     'me': 'http://example.com/me'})
914 -        preprocessor = core.NerdyStopwordsFilterPreprocessor()
915 -        nerdy = core.NerdyProcess((source,),
916 +        preprocessor = NerStopwordsFilterPreprocessor()
917 +        ner = NerProcess((source,),
918                                    preprocessors=(preprocessor,))
919 -        named_entities = nerdy.process_text(text)
920 +        named_entities = ner.process_text(text)
921          self.assertEqual(named_entities, [('http://example.com/toto', None,
922                                             Token(word='Toto', start=6, end=10,
923                                                   sentence=Sentence(indice=0, start=0, end=34)))])
924 
925 -    def test_nerdy_process_add_preprocess(self):
926 -        """ Test nerdy process """
927 +    def test_ner_process_add_preprocess(self):
928 +        """ Test ner process """
929          text = 'Hello Toto, this is   me speaking. And me.'
930          source = NerSourceLexicon({'Toto': 'http://example.com/toto',
931                                     'me': 'http://example.com/me'})
932 -        preprocessor = core.NerdyStopwordsFilterPreprocessor()
933 -        nerdy = core.NerdyProcess((source,),)
934 -        named_entities = nerdy.process_text(text)
935 +        preprocessor = NerStopwordsFilterPreprocessor()
936 +        ner = NerProcess((source,),)
937 +        named_entities = ner.process_text(text)
938          self.assertEqual(named_entities,
939                           [('http://example.com/toto', None,
940                             Token(word='Toto', start=6, end=10,
941                                   sentence=Sentence(indice=0, start=0, end=34))),
942                            ('http://example.com/me', None,
943                             Token(word='me', start=22, end=24,
944                                   sentence=Sentence(indice=0, start=0, end=34))),
945                            ('http://example.com/me', None,
946                             Token(word='me', start=39, end=41,
947                                   sentence=Sentence(indice=1, start=34, end=42)))])
948 -        nerdy.add_preprocessors(preprocessor)
949 -        named_entities = nerdy.process_text(text)
950 +        ner.add_preprocessors(preprocessor)
951 +        named_entities = ner.process_text(text)
952          self.assertEqual(named_entities, [('http://example.com/toto', None,
953                                             Token(word='Toto', start=6, end=10,
954                                                   sentence=Sentence(indice=0, start=0, end=34)))])
955 
956 -    def test_nerdy_process_chained_word(self):
957 -        """ Test nerdy process """
958 +    def test_ner_process_chained_word(self):
959 +        """ Test ner process """
960          text = 'Hello everyone me, this is   me speaking. And me.'
961          source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
962                                     'everyone me': 'http://example.com/everyone_me',
963                                     'me': 'http://example.com/me'})
964 -        nerdy = core.NerdyProcess((source,))
965 -        named_entities = nerdy.process_text(text)
966 +        ner = NerProcess((source,))
967 +        named_entities = ner.process_text(text)
968          self.assertEqual(named_entities,
969                           [('http://example.com/everyone_me', None,
970                             Token(word='everyone me', start=6, end=17,
971                                   sentence=Sentence(indice=0, start=0, end=41))),
972                            ('http://example.com/me', None,
diff --git a/test/test_preprocessor.py b/test/test_preprocessor.py
@@ -1,97 +0,0 @@
973 -# -*- coding:utf-8 -*-
974 -#
975 -# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
976 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
977 -#
978 -# This program is free software: you can redistribute it and/or modify it under
979 -# the terms of the GNU Lesser General Public License as published by the Free
980 -# Software Foundation, either version 2.1 of the License, or (at your option)
981 -# any later version.
982 -#
983 -# This program is distributed in the hope that it will be useful, but WITHOUT
984 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
985 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
986 -# details.
987 -#
988 -# You should have received a copy of the GNU Lesser General Public License along
989 -# with this program. If not, see <http://www.gnu.org/licenses/>.
990 -import unittest2
991 -
992 -from nerdy import core, tokenizer
993 -
994 -
995 -class PreprocessorTest(unittest2.TestCase):
996 -    """ Test of preprocessors """
997 -
998 -    def test_lowercasefilter(self):
999 -        preprocessor = core.NerdyLowerCaseFilterPreprocessor()
1000 -        token = tokenizer.Token('toto', 0, 4, None)
1001 -        self.assertEqual(preprocessor(token), None)
1002 -        token = tokenizer.Token('toto Tata', 0, 4, None)
1003 -        self.assertEqual(preprocessor(token), token)
1004 -        token = tokenizer.Token('toto tata', 0, 4, None)
1005 -        self.assertEqual(preprocessor(token), None)
1006 -
1007 -    def test_wordsizefilter(self):
1008 -        preprocessor = core.NerdyWordSizeFilterPreprocessor()
1009 -        token = tokenizer.Token('toto', 0, 4, None)
1010 -        self.assertEqual(preprocessor(token), token)
1011 -        preprocessor = core.NerdyWordSizeFilterPreprocessor(min_size=3)
1012 -        token = tokenizer.Token('toto', 0, 4, None)
1013 -        self.assertEqual(preprocessor(token), token)
1014 -        token = tokenizer.Token('to', 0, 4, None)
1015 -        self.assertEqual(preprocessor(token), None)
1016 -        preprocessor = core.NerdyWordSizeFilterPreprocessor(max_size=3)
1017 -        token = tokenizer.Token('toto', 0, 4, None)
1018 -        self.assertEqual(preprocessor(token), None)
1019 -        token = tokenizer.Token('to', 0, 4, None)
1020 -        self.assertEqual(preprocessor(token), token)
1021 -
1022 -    def test_lowerfirstword(self):
1023 -        preprocessor = core.NerdyLowerFirstWordPreprocessor()
1024 -        sentence = tokenizer.Sentence(0, 0, 20)
1025 -        # Start of the sentence
1026 -        token1 = tokenizer.Token('Toto tata', 0, 4, sentence)
1027 -        token2 = tokenizer.Token('Toto tata', 0, 4, sentence)
1028 -        self.assertEqual(preprocessor(token1), token2)
1029 -        token1 = tokenizer.Token('Us tata', 0, 4, sentence)
1030 -        token2 = tokenizer.Token('us tata', 0, 4, sentence)
1031 -        self.assertEqual(preprocessor(token1), token2)
1032 -        # Not start of the sentence
1033 -        token1 = tokenizer.Token('Toto tata', 12, 16, sentence)
1034 -        token2 = tokenizer.Token('Toto tata', 12, 16, sentence)
1035 -        self.assertEqual(preprocessor(token1), token2)
1036 -        token1 = tokenizer.Token('Us tata', 12, 16, sentence)
1037 -        token2 = tokenizer.Token('Us tata', 12, 16, sentence)
1038 -        self.assertEqual(preprocessor(token1), token2)
1039 -
1040 -    def test_stopwordsfilter(self):
1041 -        preprocessor = core.NerdyStopwordsFilterPreprocessor()
1042 -        token = tokenizer.Token('Toto', 0, 4, None)
1043 -        self.assertEqual(preprocessor(token), token)
1044 -        token = tokenizer.Token('Us', 0, 4, None)
1045 -        self.assertEqual(preprocessor(token), None)
1046 -        token = tokenizer.Token('Us there', 0, 4, None)
1047 -        self.assertEqual(preprocessor(token), token)
1048 -        # Split words
1049 -        preprocessor = core.NerdyStopwordsFilterPreprocessor(split_words=True)
1050 -        token = tokenizer.Token('Us there', 0, 4, None)
1051 -        self.assertEqual(preprocessor(token), None)
1052 -        token = tokenizer.Token('Us there toto', 0, 4, None)
1053 -        self.assertEqual(preprocessor(token), token)
1054 -
1055 -    def test_hashtag(self):
1056 -        preprocessor = core.NerdyHashTagPreprocessor()
1057 -        token = tokenizer.Token('Toto', 0, 4, None)
1058 -        self.assertEqual(preprocessor(token), token)
1059 -        token1 = tokenizer.Token('@BarackObama', 0, 4, None)
1060 -        token2 = tokenizer.Token('BarackObama', 0, 4, None)
1061 -        self.assertEqual(preprocessor(token1), token2)
1062 -        token1 = tokenizer.Token('@Barack_Obama', 0, 4, None)
1063 -        token2 = tokenizer.Token('Barack Obama', 0, 4, None)
1064 -        self.assertEqual(preprocessor(token1), token2)
1065 -
1066 -
1067 -if __name__ == '__main__':
1068 -    unittest2.main()
1069 -
diff --git a/test/test_preprocessors.py b/test/test_preprocessors.py
@@ -0,0 +1,98 @@
1070 +# -*- coding:utf-8 -*-
1071 +#
1072 +# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
1073 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
1074 +#
1075 +# This program is free software: you can redistribute it and/or modify it under
1076 +# the terms of the GNU Lesser General Public License as published by the Free
1077 +# Software Foundation, either version 2.1 of the License, or (at your option)
1078 +# any later version.
1079 +#
1080 +# This program is distributed in the hope that it will be useful, but WITHOUT
1081 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1082 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1083 +# details.
1084 +#
1085 +# You should have received a copy of the GNU Lesser General Public License along
1086 +# with this program. If not, see <http://www.gnu.org/licenses/>.
1087 +import unittest2
1088 +
1089 +from nazca.utils import tokenizer
1090 +from nazca.named_entities import preprocessors
1091 +
1092 +
1093 +class PreprocessorTest(unittest2.TestCase):
1094 +    """ Test of preprocessors """
1095 +
1096 +    def test_lowercasefilter(self):
1097 +        preprocessor = preprocessors.NerLowerCaseFilterPreprocessor()
1098 +        token = tokenizer.Token('toto', 0, 4, None)
1099 +        self.assertEqual(preprocessor(token), None)
1100 +        token = tokenizer.Token('toto Tata', 0, 4, None)
1101 +        self.assertEqual(preprocessor(token), token)
1102 +        token = tokenizer.Token('toto tata', 0, 4, None)
1103 +        self.assertEqual(preprocessor(token), None)
1104 +
1105 +    def test_wordsizefilter(self):
1106 +        preprocessor = preprocessors.NerWordSizeFilterPreprocessor()
1107 +        token = tokenizer.Token('toto', 0, 4, None)
1108 +        self.assertEqual(preprocessor(token), token)
1109 +        preprocessor = preprocessors.NerWordSizeFilterPreprocessor(min_size=3)
1110 +        token = tokenizer.Token('toto', 0, 4, None)
1111 +        self.assertEqual(preprocessor(token), token)
1112 +        token = tokenizer.Token('to', 0, 4, None)
1113 +        self.assertEqual(preprocessor(token), None)
1114 +        preprocessor = preprocessors.NerWordSizeFilterPreprocessor(max_size=3)
1115 +        token = tokenizer.Token('toto', 0, 4, None)
1116 +        self.assertEqual(preprocessor(token), None)
1117 +        token = tokenizer.Token('to', 0, 4, None)
1118 +        self.assertEqual(preprocessor(token), token)
1119 +
1120 +    def test_lowerfirstword(self):
1121 +        preprocessor = preprocessors.NerLowerFirstWordPreprocessor()
1122 +        sentence = tokenizer.Sentence(0, 0, 20)
1123 +        # Start of the sentence
1124 +        token1 = tokenizer.Token('Toto tata', 0, 4, sentence)
1125 +        token2 = tokenizer.Token('Toto tata', 0, 4, sentence)
1126 +        self.assertEqual(preprocessor(token1), token2)
1127 +        token1 = tokenizer.Token('Us tata', 0, 4, sentence)
1128 +        token2 = tokenizer.Token('us tata', 0, 4, sentence)
1129 +        self.assertEqual(preprocessor(token1), token2)
1130 +        # Not start of the sentence
1131 +        token1 = tokenizer.Token('Toto tata', 12, 16, sentence)
1132 +        token2 = tokenizer.Token('Toto tata', 12, 16, sentence)
1133 +        self.assertEqual(preprocessor(token1), token2)
1134 +        token1 = tokenizer.Token('Us tata', 12, 16, sentence)
1135 +        token2 = tokenizer.Token('Us tata', 12, 16, sentence)
1136 +        self.assertEqual(preprocessor(token1), token2)
1137 +
1138 +    def test_stopwordsfilter(self):
1139 +        preprocessor = preprocessors.NerStopwordsFilterPreprocessor()
1140 +        token = tokenizer.Token('Toto', 0, 4, None)
1141 +        self.assertEqual(preprocessor(token), token)
1142 +        token = tokenizer.Token('Us', 0, 4, None)
1143 +        self.assertEqual(preprocessor(token), None)
1144 +        token = tokenizer.Token('Us there', 0, 4, None)
1145 +        self.assertEqual(preprocessor(token), token)
1146 +        # Split words
1147 +        preprocessor = preprocessors.NerStopwordsFilterPreprocessor(split_words=True)
1148 +        token = tokenizer.Token('Us there', 0, 4, None)
1149 +        self.assertEqual(preprocessor(token), None)
1150 +        token = tokenizer.Token('Us there toto', 0, 4, None)
1151 +        self.assertEqual(preprocessor(token), token)
1152 +
1153 +    def test_hashtag(self):
1154 +        preprocessor = preprocessors.NerHashTagPreprocessor()
1155 +        token = tokenizer.Token('Toto', 0, 4, None)
1156 +        self.assertEqual(preprocessor(token), token)
1157 +        token1 = tokenizer.Token('@BarackObama', 0, 4, None)
1158 +        token2 = tokenizer.Token('BarackObama', 0, 4, None)
1159 +        self.assertEqual(preprocessor(token1), token2)
1160 +        token1 = tokenizer.Token('@Barack_Obama', 0, 4, None)
1161 +        token2 = tokenizer.Token('Barack Obama', 0, 4, None)
1162 +        self.assertEqual(preprocessor(token1), token2)
1163 +
1164 +
1165 +if __name__ == '__main__':
1166 +    unittest2.main()
1167 +