[ner] Cleanup Nerdy, related to #187461

author: vincent.michel@logilab.fr
changeset: 00d352769ba0
branch: default
phase: draft
hidden: yes
parent revision: #36951167576c [pkginfo] Rename ner in named_entities and update pkginfo, related to #187461
child revision: #ec7d7ce1ca35 [dataio] Merge dataio and tests, related to #187461
files modified by this revision
named_entities/core.py
named_entities/named_entities.py
test/test_core.py
test/test_named_entities.py
# HG changeset patch
# User vincent.michel@logilab.fr
# Date 1387464314 0
# Thu Dec 19 14:45:14 2013 +0000
# Node ID 00d352769ba0ff7f0d1e8366421322ad98af7e6b
# Parent 36951167576cdf576b711e1005726ac054611299
[ner] Cleanup Nerdy, related to #187461

diff --git a/named_entities/core.py b/named_entities/core.py
@@ -1,396 +0,0 @@
1 -# -*- coding: utf-8 -*-
2 -""" Core functions for Named Entities Recognition.
3 -"""
4 -from nerdy.tokenizer import RichStringTokenizer, Token
5 -from nerdy.dataio import sparql_query, rql_url_query, rql_appid_query
6 -from nerdy.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
7 -
8 -STOPWORDS = {'fr': FRENCH_STOPWORDS,
9 -             'en': ENGLISH_STOPWORDS}
10 -
11 -# XXX Add SQL source ?
12 -# XXX NER preprocessor
13 -
14 -###############################################################################
15 -### NER SOURCE ################################################################
16 -###############################################################################
17 -class AbstractNerdySource(object):
18 -    """ High-level source for Named Entities Recognition
19 -    """
20 -
21 -    def __init__(self, query, endpoint, name=None, use_cache=True, preprocessors=None):
22 -        """ Initialise the class.
23 -        """
24 -        self.query = query
25 -        self.endpoint = endpoint
26 -        self.name = name
27 -        self.preprocessors = preprocessors or []
28 -        self.use_cache = use_cache
29 -        self._recognized_cache = {}
30 -
31 -    def add_preprocessors(self, preprocessor):
32 -        """ Add a preprocessor
33 -        """
34 -        self.preprocessors.append(preprocessor)
35 -
36 -    def recognize_token(self, token):
37 -        """ Recognize a token
38 -        """
39 -        # Applies source specific preprocessors
40 -        for preprocessor in self.preprocessors:
41 -            token = preprocessor(token)
42 -            if not token:
43 -                return []
44 -        if self.use_cache and token.word in self._recognized_cache:
45 -            return self._recognized_cache[token.word]
46 -        uris = self.query_word(token.word) if token.word else []
47 -        if self.use_cache:
48 -            self._recognized_cache[token.word] = uris
49 -        return uris
50 -
51 -    def query_word(self, word):
52 -        """ Query a word for a Named Entities Recognition process
53 -        """
54 -        raise NotImplementedError
55 -
56 -
57 -class NerdySourceLexical(AbstractNerdySource):
58 -    """ Source based on a (pre-computed) dictionnary of words (token, uri)
59 -    """
60 -    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
61 -        self.lexicon = lexicon
62 -        self.name = name
63 -        self.preprocessors = preprocessors or []
64 -        self.use_cache = use_cache
65 -        self._recognized_cache = {}
66 -
67 -    def query_word(self, word):
68 -        uri = self.lexicon.get(word)
69 -        return [uri,] if uri else []
70 -
71 -
72 -class NerdySourceLocalRql(AbstractNerdySource):
73 -    """ High-level source for Named Entities Recognition
74 -    Local RQL version
75 -    """
76 -
77 -    def __init__(self, query, session, name=None, use_cache=True, preprocessors=None):
78 -        """ Initialise the class.
79 -        """
80 -        self.query = query
81 -        self.session = session
82 -        self.name = name
83 -        self.preprocessors = preprocessors or []
84 -        self.use_cache = use_cache
85 -        self._recognized_cache = {}
86 -
87 -    def query_word(self, word):
88 -        """ Query a word for a Named Entities Recognition process
89 -        """
90 -        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
91 -
92 -
93 -class NerdySourceAppidRql(AbstractNerdySource):
94 -    """ High-level source for Named Entities Recognition
95 -    Appid RQL version
96 -    """
97 -
98 -    def query_word(self, word):
99 -        """ Query a word for a Named Entities Recognition process
100 -        """
101 -        return [r[0] for r in rql_appid_query(self.query, self.endpoint, word=word)]
102 -
103 -
104 -class NerdySourceUrlRql(AbstractNerdySource):
105 -    """ High-level source for Named Entities Recognition
106 -    Url RQL version
107 -    """
108 -
109 -    def query_word(self, word):
110 -        """ Query a word for a Named Entities Recognition process
111 -        """
112 -        return [r[0] for r in rql_url_query(self.query % {'word': word}, self.endpoint)]
113 -
114 -
115 -class NerdySourceSparql(AbstractNerdySource):
116 -    """ High-level source for Named Entities Recognition
117 -    SPARQL version
118 -
119 -    >>> from nerdy.core import NerdySourceSparql
120 -    >>> ner_source = NerdySourceSparql('''SELECT ?uri
121 -    ...                                    WHERE{
122 -    ...                                    ?uri rdfs:label "%(word)s"@en}''',
123 -    ...                                 'http://dbpedia.org/sparql')
124 -    >>> print ner_source.recognize_token('Victor Hugo')
125 -    ['http://dbpedia.org/resource/Category:Victor_Hugo',
126 -     'http://dbpedia.org/resource/Victor_Hugo',
127 -     'http://dbpedia.org/class/yago/VictorHugo',
128 -     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
129 -     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
130 -     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
131 -
132 -    """
133 -
134 -    def query_word(self, word):
135 -        """ Query a word for a Named Entities Recognition process
136 -        """
137 -        return [r['uri']['value'] for r in sparql_query(self.query % {'word': word}, self.endpoint)]
138 -
139 -
140 -###############################################################################
141 -### NER PREPROCESSORS #########################################################
142 -###############################################################################
143 -class AbstractNerdyPreprocessor(object):
144 -    """ Preprocessor
145 -    """
146 -
147 -    def __call__(self, token):
148 -        raise NotImplementedError
149 -
150 -
151 -class NerdyWordSizeFilterPreprocessor(AbstractNerdyPreprocessor):
152 -    """ Remove token based on the size of the word
153 -    """
154 -    def __init__(self, min_size=None, max_size=None):
155 -        self.min_size = min_size
156 -        self.max_size = max_size
157 -
158 -    def __call__(self, token):
159 -        if ((self.min_size and len(token.word)<self.min_size)
160 -            or (self.max_size and len(token.word)>self.max_size)):
161 -            return None
162 -        return token
163 -
164 -
165 -class NerdyLowerCaseFilterPreprocessor(AbstractNerdyPreprocessor):
166 -    """ Remove token with word in lower case
167 -    """
168 -
169 -    def __call__(self, token):
170 -        return None if token.word.islower() else token
171 -
172 -
173 -class NerdyLowerFirstWordPreprocessor(AbstractNerdyPreprocessor):
174 -    """ Lower the first word of each sentence if it is a stopword.
175 -    """
176 -    def __init__(self, lang='en'):
177 -        self.lang = lang
178 -
179 -    def __call__(self, token):
180 -        if (token.start == token.sentence.start and
181 -            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
182 -            word = token.word[0].lower() + token.word[1:]
183 -            return Token(word, token.start, token.end, token.sentence)
184 -        return token
185 -
186 -
187 -class NerdyStopwordsFilterPreprocessor(AbstractNerdyPreprocessor):
188 -    """ Remove stopwords
189 -    """
190 -    def __init__(self, split_words=False, lang='en'):
191 -        self.split_words = split_words
192 -        self.lang = lang
193 -
194 -    def __call__(self, token):
195 -        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
196 -        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
197 -            return None
198 -        if not self.split_words and token.word.lower() in stopwords:
199 -            return None
200 -        return token
201 -
202 -
203 -class NerdyHashTagPreprocessor(AbstractNerdyPreprocessor):
204 -    """ Cleanup hashtag
205 -    """
206 -    def __call__(self, token):
207 -        if token.word.startswith('@'):
208 -            # XXX Split capitalize letter ?
209 -            # @BarackObama -> Barack Obama
210 -            word = token.word[1:].replace('_', ' ')
211 -            return Token(word, token.start, token.end, token.sentence)
212 -        return token
213 -
214 -
215 -###############################################################################
216 -### NER FILTERS ###############################################################
217 -###############################################################################
218 -class AbstractNerdyFilter(object):
219 -    """ A filter used for cleaning named entities results
220 -    """
221 -
222 -    def __call__(self, named_entities):
223 -        raise NotImplementedError
224 -
225 -
226 -class NerdyOccurenceFilter(object):
227 -    """ A filter based on the number of occurence of
228 -    named entities in the results.
229 -    """
230 -    def __init__(self, min_occ=None, max_occ=None):
231 -        self.min_occ = min_occ
232 -        self.max_occ = max_occ
233 -
234 -    def __call__(self, named_entities):
235 -        uris = [u for u, p, t in named_entities]
236 -        counts = dict([(u, uris.count(u)) for u in set(uris)])
237 -        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
238 -                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
239 -
240 -
241 -class NerdyRDFTypeFilter(object):
242 -    """ A filter based on the RDF type on entity
243 -    E.g.
244 -
245 -    filter = NerdyRDFTypeFilter('http://dbpedia.org/sparql',
246 -                                ('http://schema.org/Place',
247 -                                'http://dbpedia.org/ontology/Agent',
248 -                                'http://dbpedia.org/ontology/Place'))
249 -
250 -    """
251 -    def __init__(self, endpoint, accepted_types):
252 -        self.endpoint = endpoint
253 -        self.accepted_types = accepted_types
254 -        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
255 -
256 -    def __call__(self, named_entities):
257 -        filtered_named_entities = []
258 -        seen_uris = {}
259 -        for uri, p, t in named_entities:
260 -            if uri in seen_uris:
261 -                if seen_uris[uri]:
262 -                    filtered_named_entities.append((uri, p, t))
263 -            else:
264 -                results = sparql_query(self.query % {'uri': uri}, self.endpoint)
265 -                types = set([r['type']['value'] for r in results])
266 -                if not len(types.intersection(self.accepted_types)):
267 -                    seen_uris[uri] = False
268 -                else:
269 -                    seen_uris[uri] = True
270 -                    filtered_named_entities.append((uri, p, t))
271 -        return filtered_named_entities
272 -
273 -
274 -class NerdyDisambiguationWordParts(object):
275 -    """ Disambiguate named entities based on the words parts.
276 -    E.g.:
277 -          'toto tutu': 'http://example.com/toto_tutu',
278 -          'toto': 'http://example.com/toto'
279 -
280 -          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
281 -          by 'http://example.com/toto_tutu'
282 -    """
283 -    def __call__(self, named_entities):
284 -        # Create parts dictionary
285 -        parts = {}
286 -        for uri, peid, token in named_entities:
287 -            if ' ' in token.word:
288 -                for part in token.word.split(' '):
289 -                    parts[part.lower()] = uri
290 -        # Replace named entities
291 -        filtered_named_entities = []
292 -        for uri, peid, token in named_entities:
293 -            if token.word in parts:
294 -                # Change URI
295 -                uri = parts[token.word]
296 -            filtered_named_entities.append((uri, peid, token))
297 -        return filtered_named_entities
298 -
299 -
300 -class NerdyReplacementRulesFilter(object):
301 -    """ Allow to define replacement rules for Named Entities
302 -    """
303 -    def __init__(self, rules):
304 -        self.rules = rules
305 -
306 -    def __call__(self, named_entities):
307 -        filtered_named_entities = []
308 -        for uri, peid, token in named_entities:
309 -            uri = self.rules.get(uri, uri)
310 -            filtered_named_entities.append((uri, peid, token))
311 -        return filtered_named_entities
312 -
313 -
314 -###############################################################################
315 -### NER PROCESS ###############################################################
316 -###############################################################################
317 -class NerdyProcess(object):
318 -    """ High-level process for Named Entities Recognition
319 -    """
320 -
321 -    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
322 -        """ Initialise the class.
323 -
324 -        :ner_sources: a sequence of NER sources used to recognize tokens
325 -        """
326 -        self.ner_sources = list(ner_sources)
327 -        self.preprocessors = preprocessors or []
328 -        self.filters = filters or []
329 -        self.unique = unique
330 -
331 -    def add_ner_source(self, process):
332 -        """ Add a ner process
333 -        """
334 -        self.ner_sources.append(process)
335 -
336 -    def add_preprocessors(self, preprocessor):
337 -        """ Add a preprocessor
338 -        """
339 -        self.preprocessors.append(preprocessor)
340 -
341 -    def add_filters(self, filter):
342 -        """ Add a filter
343 -        """
344 -        self.filters.append(filter)
345 -
346 -    def process_text(self, text):
347 -        """ High level function for analyzing a text
348 -        """
349 -        tokenizer = RichStringTokenizer(text)
350 -        return self.recognize_tokens(tokenizer)
351 -
352 -    def recognize_tokens(self, tokens):
353 -        """ Recognize Named Entities from a tokenizer or
354 -        an iterator yielding tokens.
355 -        """
356 -        last_stop = 0
357 -        named_entities = []
358 -        for token in tokens:
359 -            if token.start < last_stop:
360 -                continue # this token overlaps with a previous match
361 -            word = token.word
362 -            # Applies preprocessors
363 -            # XXX Preprocessors may be source dependent
364 -            for preprocessor in self.preprocessors:
365 -                token = preprocessor(token)
366 -                if not token:
367 -                    break
368 -            if not token:
369 -                continue
370 -            recognized = False
371 -            for process in self.ner_sources:
372 -                for uri in process.recognize_token(token):
373 -                    named_entities.append((uri, process.name, token))
374 -                    recognized = True
375 -                    last_stop = token.end
376 -                    if self.unique:
377 -                        break
378 -                if recognized and self.unique:
379 -                    break
380 -        # XXX Postprocessing/filters may be source dependent
381 -        return self.postprocess(named_entities)
382 -
383 -    def postprocess(self, named_entities):
384 -        """ Postprocess the results by applying filters """
385 -        for filter in self.filters:
386 -            named_entities = filter(named_entities)
387 -        return named_entities
388 -
389 -
390 -###############################################################################
391 -### NER RELATIONS PROCESS #####################################################
392 -###############################################################################
393 -class NerdyRelationsProcess(object):
394 -    """ Process for building simple relation from named entities results
395 -    """
396 -    pass
diff --git a/named_entities/named_entities.py b/named_entities/named_entities.py
@@ -0,0 +1,396 @@
397 +# -*- coding: utf-8 -*-
398 +""" Core functions for Named Entities Recognition.
399 +"""
400 +from nerdy.tokenizer import RichStringTokenizer, Token
401 +from nerdy.dataio import sparql_query, rql_url_query, rql_appid_query
402 +from nerdy.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
403 +
404 +STOPWORDS = {'fr': FRENCH_STOPWORDS,
405 +             'en': ENGLISH_STOPWORDS}
406 +
407 +# XXX Add SQL source ?
408 +# XXX NER preprocessor
409 +
410 +###############################################################################
411 +### NER SOURCE ################################################################
412 +###############################################################################
413 +class AbstractNerdySource(object):
414 +    """ High-level source for Named Entities Recognition
415 +    """
416 +
417 +    def __init__(self, query, endpoint, name=None, use_cache=True, preprocessors=None):
418 +        """ Initialise the class.
419 +        """
420 +        self.query = query
421 +        self.endpoint = endpoint
422 +        self.name = name
423 +        self.preprocessors = preprocessors or []
424 +        self.use_cache = use_cache
425 +        self._recognized_cache = {}
426 +
427 +    def add_preprocessors(self, preprocessor):
428 +        """ Add a preprocessor
429 +        """
430 +        self.preprocessors.append(preprocessor)
431 +
432 +    def recognize_token(self, token):
433 +        """ Recognize a token
434 +        """
435 +        # Applies source specific preprocessors
436 +        for preprocessor in self.preprocessors:
437 +            token = preprocessor(token)
438 +            if not token:
439 +                return []
440 +        if self.use_cache and token.word in self._recognized_cache:
441 +            return self._recognized_cache[token.word]
442 +        uris = self.query_word(token.word) if token.word else []
443 +        if self.use_cache:
444 +            self._recognized_cache[token.word] = uris
445 +        return uris
446 +
447 +    def query_word(self, word):
448 +        """ Query a word for a Named Entities Recognition process
449 +        """
450 +        raise NotImplementedError
451 +
452 +
453 +class NerdySourceLexical(AbstractNerdySource):
454 +    """ Source based on a (pre-computed) dictionnary of words (token, uri)
455 +    """
456 +    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
457 +        self.lexicon = lexicon
458 +        self.name = name
459 +        self.preprocessors = preprocessors or []
460 +        self.use_cache = use_cache
461 +        self._recognized_cache = {}
462 +
463 +    def query_word(self, word):
464 +        uri = self.lexicon.get(word)
465 +        return [uri,] if uri else []
466 +
467 +
468 +class NerdySourceLocalRql(AbstractNerdySource):
469 +    """ High-level source for Named Entities Recognition
470 +    Local RQL version
471 +    """
472 +
473 +    def __init__(self, query, session, name=None, use_cache=True, preprocessors=None):
474 +        """ Initialise the class.
475 +        """
476 +        self.query = query
477 +        self.session = session
478 +        self.name = name
479 +        self.preprocessors = preprocessors or []
480 +        self.use_cache = use_cache
481 +        self._recognized_cache = {}
482 +
483 +    def query_word(self, word):
484 +        """ Query a word for a Named Entities Recognition process
485 +        """
486 +        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
487 +
488 +
489 +class NerdySourceAppidRql(AbstractNerdySource):
490 +    """ High-level source for Named Entities Recognition
491 +    Appid RQL version
492 +    """
493 +
494 +    def query_word(self, word):
495 +        """ Query a word for a Named Entities Recognition process
496 +        """
497 +        return [r[0] for r in rql_appid_query(self.query, self.endpoint, word=word)]
498 +
499 +
500 +class NerdySourceUrlRql(AbstractNerdySource):
501 +    """ High-level source for Named Entities Recognition
502 +    Url RQL version
503 +    """
504 +
505 +    def query_word(self, word):
506 +        """ Query a word for a Named Entities Recognition process
507 +        """
508 +        return [r[0] for r in rql_url_query(self.query % {'word': word}, self.endpoint)]
509 +
510 +
511 +class NerdySourceSparql(AbstractNerdySource):
512 +    """ High-level source for Named Entities Recognition
513 +    SPARQL version
514 +
515 +    >>> from nerdy.core import NerdySourceSparql
516 +    >>> ner_source = NerdySourceSparql('''SELECT ?uri
517 +    ...                                    WHERE{
518 +    ...                                    ?uri rdfs:label "%(word)s"@en}''',
519 +    ...                                 'http://dbpedia.org/sparql')
520 +    >>> print ner_source.recognize_token('Victor Hugo')
521 +    ['http://dbpedia.org/resource/Category:Victor_Hugo',
522 +     'http://dbpedia.org/resource/Victor_Hugo',
523 +     'http://dbpedia.org/class/yago/VictorHugo',
524 +     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
525 +     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
526 +     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
527 +
528 +    """
529 +
530 +    def query_word(self, word):
531 +        """ Query a word for a Named Entities Recognition process
532 +        """
533 +        return [r['uri']['value'] for r in sparql_query(self.query % {'word': word}, self.endpoint)]
534 +
535 +
536 +###############################################################################
537 +### NER PREPROCESSORS #########################################################
538 +###############################################################################
539 +class AbstractNerdyPreprocessor(object):
540 +    """ Preprocessor
541 +    """
542 +
543 +    def __call__(self, token):
544 +        raise NotImplementedError
545 +
546 +
547 +class NerdyWordSizeFilterPreprocessor(AbstractNerdyPreprocessor):
548 +    """ Remove token based on the size of the word
549 +    """
550 +    def __init__(self, min_size=None, max_size=None):
551 +        self.min_size = min_size
552 +        self.max_size = max_size
553 +
554 +    def __call__(self, token):
555 +        if ((self.min_size and len(token.word)<self.min_size)
556 +            or (self.max_size and len(token.word)>self.max_size)):
557 +            return None
558 +        return token
559 +
560 +
561 +class NerdyLowerCaseFilterPreprocessor(AbstractNerdyPreprocessor):
562 +    """ Remove token with word in lower case
563 +    """
564 +
565 +    def __call__(self, token):
566 +        return None if token.word.islower() else token
567 +
568 +
569 +class NerdyLowerFirstWordPreprocessor(AbstractNerdyPreprocessor):
570 +    """ Lower the first word of each sentence if it is a stopword.
571 +    """
572 +    def __init__(self, lang='en'):
573 +        self.lang = lang
574 +
575 +    def __call__(self, token):
576 +        if (token.start == token.sentence.start and
577 +            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
578 +            word = token.word[0].lower() + token.word[1:]
579 +            return Token(word, token.start, token.end, token.sentence)
580 +        return token
581 +
582 +
583 +class NerdyStopwordsFilterPreprocessor(AbstractNerdyPreprocessor):
584 +    """ Remove stopwords
585 +    """
586 +    def __init__(self, split_words=False, lang='en'):
587 +        self.split_words = split_words
588 +        self.lang = lang
589 +
590 +    def __call__(self, token):
591 +        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
592 +        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
593 +            return None
594 +        if not self.split_words and token.word.lower() in stopwords:
595 +            return None
596 +        return token
597 +
598 +
599 +class NerdyHashTagPreprocessor(AbstractNerdyPreprocessor):
600 +    """ Cleanup hashtag
601 +    """
602 +    def __call__(self, token):
603 +        if token.word.startswith('@'):
604 +            # XXX Split capitalize letter ?
605 +            # @BarackObama -> Barack Obama
606 +            word = token.word[1:].replace('_', ' ')
607 +            return Token(word, token.start, token.end, token.sentence)
608 +        return token
609 +
610 +
611 +###############################################################################
612 +### NER FILTERS ###############################################################
613 +###############################################################################
614 +class AbstractNerdyFilter(object):
615 +    """ A filter used for cleaning named entities results
616 +    """
617 +
618 +    def __call__(self, named_entities):
619 +        raise NotImplementedError
620 +
621 +
622 +class NerdyOccurenceFilter(object):
623 +    """ A filter based on the number of occurence of
624 +    named entities in the results.
625 +    """
626 +    def __init__(self, min_occ=None, max_occ=None):
627 +        self.min_occ = min_occ
628 +        self.max_occ = max_occ
629 +
630 +    def __call__(self, named_entities):
631 +        uris = [u for u, p, t in named_entities]
632 +        counts = dict([(u, uris.count(u)) for u in set(uris)])
633 +        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
634 +                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
635 +
636 +
637 +class NerdyRDFTypeFilter(object):
638 +    """ A filter based on the RDF type on entity
639 +    E.g.
640 +
641 +    filter = NerdyRDFTypeFilter('http://dbpedia.org/sparql',
642 +                                ('http://schema.org/Place',
643 +                                'http://dbpedia.org/ontology/Agent',
644 +                                'http://dbpedia.org/ontology/Place'))
645 +
646 +    """
647 +    def __init__(self, endpoint, accepted_types):
648 +        self.endpoint = endpoint
649 +        self.accepted_types = accepted_types
650 +        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
651 +
652 +    def __call__(self, named_entities):
653 +        filtered_named_entities = []
654 +        seen_uris = {}
655 +        for uri, p, t in named_entities:
656 +            if uri in seen_uris:
657 +                if seen_uris[uri]:
658 +                    filtered_named_entities.append((uri, p, t))
659 +            else:
660 +                results = sparql_query(self.query % {'uri': uri}, self.endpoint)
661 +                types = set([r['type']['value'] for r in results])
662 +                if not len(types.intersection(self.accepted_types)):
663 +                    seen_uris[uri] = False
664 +                else:
665 +                    seen_uris[uri] = True
666 +                    filtered_named_entities.append((uri, p, t))
667 +        return filtered_named_entities
668 +
669 +
670 +class NerdyDisambiguationWordParts(object):
671 +    """ Disambiguate named entities based on the words parts.
672 +    E.g.:
673 +          'toto tutu': 'http://example.com/toto_tutu',
674 +          'toto': 'http://example.com/toto'
675 +
676 +          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
677 +          by 'http://example.com/toto_tutu'
678 +    """
679 +    def __call__(self, named_entities):
680 +        # Create parts dictionary
681 +        parts = {}
682 +        for uri, peid, token in named_entities:
683 +            if ' ' in token.word:
684 +                for part in token.word.split(' '):
685 +                    parts[part.lower()] = uri
686 +        # Replace named entities
687 +        filtered_named_entities = []
688 +        for uri, peid, token in named_entities:
689 +            if token.word in parts:
690 +                # Change URI
691 +                uri = parts[token.word]
692 +            filtered_named_entities.append((uri, peid, token))
693 +        return filtered_named_entities
694 +
695 +
696 +class NerdyReplacementRulesFilter(object):
697 +    """ Allow to define replacement rules for Named Entities
698 +    """
699 +    def __init__(self, rules):
700 +        self.rules = rules
701 +
702 +    def __call__(self, named_entities):
703 +        filtered_named_entities = []
704 +        for uri, peid, token in named_entities:
705 +            uri = self.rules.get(uri, uri)
706 +            filtered_named_entities.append((uri, peid, token))
707 +        return filtered_named_entities
708 +
709 +
710 +###############################################################################
711 +### NER PROCESS ###############################################################
712 +###############################################################################
713 +class NerdyProcess(object):
714 +    """ High-level process for Named Entities Recognition
715 +    """
716 +
717 +    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
718 +        """ Initialise the class.
719 +
720 +        :ner_sources: a sequence of NER sources used to recognize tokens
721 +        """
722 +        self.ner_sources = list(ner_sources)
723 +        self.preprocessors = preprocessors or []
724 +        self.filters = filters or []
725 +        self.unique = unique
726 +
727 +    def add_ner_source(self, process):
728 +        """ Add a ner process
729 +        """
730 +        self.ner_sources.append(process)
731 +
732 +    def add_preprocessors(self, preprocessor):
733 +        """ Add a preprocessor
734 +        """
735 +        self.preprocessors.append(preprocessor)
736 +
737 +    def add_filters(self, filter):
738 +        """ Add a filter
739 +        """
740 +        self.filters.append(filter)
741 +
742 +    def process_text(self, text):
743 +        """ High level function for analyzing a text
744 +        """
745 +        tokenizer = RichStringTokenizer(text)
746 +        return self.recognize_tokens(tokenizer)
747 +
748 +    def recognize_tokens(self, tokens):
749 +        """ Recognize Named Entities from a tokenizer or
750 +        an iterator yielding tokens.
751 +        """
752 +        last_stop = 0
753 +        named_entities = []
754 +        for token in tokens:
755 +            if token.start < last_stop:
756 +                continue # this token overlaps with a previous match
757 +            word = token.word
758 +            # Applies preprocessors
759 +            # XXX Preprocessors may be source dependent
760 +            for preprocessor in self.preprocessors:
761 +                token = preprocessor(token)
762 +                if not token:
763 +                    break
764 +            if not token:
765 +                continue
766 +            recognized = False
767 +            for process in self.ner_sources:
768 +                for uri in process.recognize_token(token):
769 +                    named_entities.append((uri, process.name, token))
770 +                    recognized = True
771 +                    last_stop = token.end
772 +                    if self.unique:
773 +                        break
774 +                if recognized and self.unique:
775 +                    break
776 +        # XXX Postprocessing/filters may be source dependent
777 +        return self.postprocess(named_entities)
778 +
779 +    def postprocess(self, named_entities):
780 +        """ Postprocess the results by applying filters """
781 +        for filter in self.filters:
782 +            named_entities = filter(named_entities)
783 +        return named_entities
784 +
785 +
786 +###############################################################################
787 +### NER RELATIONS PROCESS #####################################################
788 +###############################################################################
789 +class NerdyRelationsProcess(object):
790 +    """ Process for building simple relation from named entities results
791 +    """
792 +    pass
diff --git a/test/test_core.py b/test/test_core.py
@@ -1,225 +0,0 @@
793 -# -*- coding:utf-8 -*-
794 -#
795 -# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
796 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
797 -#
798 -# This program is free software: you can redistribute it and/or modify it under
799 -# the terms of the GNU Lesser General Public License as published by the Free
800 -# Software Foundation, either version 2.1 of the License, or (at your option)
801 -# any later version.
802 -#
803 -# This program is distributed in the hope that it will be useful, but WITHOUT
804 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
805 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
806 -# details.
807 -#
808 -# You should have received a copy of the GNU Lesser General Public License along
809 -# with this program. If not, see <http://www.gnu.org/licenses/>.
810 -import unittest2
811 -
812 -from nerdy import core
813 -from nerdy.tokenizer import Token, Sentence
814 -
815 -
816 -class CoreTest(unittest2.TestCase):
817 -    """ Test of core """
818 -
819 -    def test_lexical_source(self):
820 -        """ Test lexical source """
821 -        lexicon = {'everyone': 'http://example.com/everyone',
822 -                   'me': 'http://example.com/me'}
823 -        source = core.NerdySourceLexical(lexicon)
824 -        self.assertEqual(source.query_word('me'), ['http://example.com/me',])
825 -        self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
826 -        self.assertEqual(source.query_word('me everyone'), [])
827 -        self.assertEqual(source.query_word('toto'), [])
828 -        # Token
829 -        token = Token('me', 0, 2, None)
830 -        self.assertEqual(source.recognize_token(token), ['http://example.com/me',])
831 -        token = Token('ma', 0, 2, None)
832 -        self.assertEqual(source.recognize_token(token), [])
833 -
834 -    def test_rql_source(self):
835 -        """ Test rql source """
836 -        source = core.NerdySourceUrlRql('Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"',
837 -                                       'http://www.cubicweb.org')
838 -        self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
839 -
840 -    def test_sparql_source(self):
841 -        """ Test sparql source """
842 -        source = core.NerdySourceSparql(u'''SELECT ?uri
843 -                                            WHERE{
844 -                                            ?uri rdfs:label "Python"@en .
845 -                                            ?uri rdf:type ?type}''',
846 -                                        u'http://dbpedia.org/sparql')
847 -        self.assertEqual(source.query_word('cubicweb'),
848 -                         [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
849 -                          u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
850 -
851 -    def test_nerdy_process(self):
852 -        """ Test nerdy process """
853 -        text = 'Hello everyone, this is   me speaking. And me.'
854 -        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
855 -                                          'me': 'http://example.com/me'})
856 -        nerdy = core.NerdyProcess((source,))
857 -        named_entities = nerdy.process_text(text)
858 -        self.assertEqual(named_entities,
859 -                         [('http://example.com/everyone', None,
860 -                           Token(word='everyone', start=6, end=14,
861 -                                           sentence=Sentence(indice=0, start=0, end=38))),
862 -                          ('http://example.com/me', None,
863 -                           Token(word='me', start=26, end=28,
864 -                                           sentence=Sentence(indice=0, start=0, end=38))),
865 -                          ('http://example.com/me', None,
866 -                           Token(word='me', start=43, end=45,
867 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
868 -
869 -    def test_nerdy_process_multisources(self):
870 -        """ Test nerdy process """
871 -        text = 'Hello everyone, this is   me speaking. And me.'
872 -        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
873 -                                          'me': 'http://example.com/me'})
874 -        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
875 -        # Two sources, not unique
876 -        nerdy = core.NerdyProcess((source1, source2))
877 -        named_entities = nerdy.process_text(text)
878 -        self.assertEqual(named_entities,
879 -                         [('http://example.com/everyone', None,
880 -                           Token(word='everyone', start=6, end=14,
881 -                                           sentence=Sentence(indice=0, start=0, end=38))),
882 -                          ('http://example.com/me', None,
883 -                           Token(word='me', start=26, end=28,
884 -                                           sentence=Sentence(indice=0, start=0, end=38))),
885 -                          ('http://example2.com/me', None,
886 -                           Token(word='me', start=26, end=28,
887 -                                           sentence=Sentence(indice=0, start=0, end=38))),
888 -                          ('http://example.com/me', None,
889 -                           Token(word='me', start=43, end=45,
890 -                                           sentence=Sentence(indice=1, start=38, end=46))),
891 -                          ('http://example2.com/me', None,
892 -                           Token(word='me', start=43, end=45,
893 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
894 -        # Two sources, unique
895 -        nerdy = core.NerdyProcess((source1, source2), unique=True)
896 -        named_entities = nerdy.process_text(text)
897 -        self.assertEqual(named_entities,
898 -                         [('http://example.com/everyone', None,
899 -                           Token(word='everyone', start=6, end=14,
900 -                                           sentence=Sentence(indice=0, start=0, end=38))),
901 -                          ('http://example.com/me', None,
902 -                           Token(word='me', start=26, end=28,
903 -                                           sentence=Sentence(indice=0, start=0, end=38))),
904 -                          ('http://example.com/me', None,
905 -                           Token(word='me', start=43, end=45,
906 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
907 -        # Two sources reversed, unique
908 -        nerdy = core.NerdyProcess((source2, source1), unique=True)
909 -        named_entities = nerdy.process_text(text)
910 -        self.assertEqual(named_entities,
911 -                         [('http://example.com/everyone', None,
912 -                           Token(word='everyone', start=6, end=14,
913 -                                           sentence=Sentence(indice=0, start=0, end=38))),
914 -                          ('http://example2.com/me', None,
915 -                           Token(word='me', start=26, end=28,
916 -                                           sentence=Sentence(indice=0, start=0, end=38))),
917 -                          ('http://example2.com/me', None,
918 -                           Token(word='me', start=43, end=45,
919 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
920 -
921 -    def test_nerdy_process_add_sources(self):
922 -        """ Test nerdy process """
923 -        text = 'Hello everyone, this is   me speaking. And me.'
924 -        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
925 -                                          'me': 'http://example.com/me'})
926 -        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
927 -        nerdy = core.NerdyProcess((source1,))
928 -        named_entities = nerdy.process_text(text)
929 -        self.assertEqual(named_entities,
930 -                         [('http://example.com/everyone', None,
931 -                           Token(word='everyone', start=6, end=14,
932 -                                           sentence=Sentence(indice=0, start=0, end=38))),
933 -                          ('http://example.com/me', None,
934 -                           Token(word='me', start=26, end=28,
935 -                                           sentence=Sentence(indice=0, start=0, end=38))),
936 -                          ('http://example.com/me', None,
937 -                           Token(word='me', start=43, end=45,
938 -                                           sentence=Sentence(indice=1, start=38, end=46))),])
939 -        # Two sources, not unique
940 -        nerdy.add_ner_source(source2)
941 -        named_entities = nerdy.process_text(text)
942 -        self.assertEqual(named_entities,
943 -                         [('http://example.com/everyone', None,
944 -                           Token(word='everyone', start=6, end=14,
945 -                                           sentence=Sentence(indice=0, start=0, end=38))),
946 -                          ('http://example.com/me', None,
947 -                           Token(word='me', start=26, end=28,
948 -                                           sentence=Sentence(indice=0, start=0, end=38))),
949 -                          ('http://example2.com/me', None,
950 -                           Token(word='me', start=26, end=28,
951 -                                           sentence=Sentence(indice=0, start=0, end=38))),
952 -                          ('http://example.com/me', None,
953 -                           Token(word='me', start=43, end=45,
954 -                                           sentence=Sentence(indice=1, start=38, end=46))),
955 -                          ('http://example2.com/me', None,
956 -                           Token(word='me', start=43, end=45,
957 -                                           sentence=Sentence(indice=1, start=38, end=46)))])
958 -
959 -    def test_nerdy_process_preprocess(self):
960 -        """ Test nerdy process """
961 -        text = 'Hello Toto, this is   me speaking. And me.'
962 -        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
963 -                                          'me': 'http://example.com/me'})
964 -        preprocessor = core.NerdyStopwordsFilterPreprocessor()
965 -        nerdy = core.NerdyProcess((source,),
966 -                                  preprocessors=(preprocessor,))
967 -        named_entities = nerdy.process_text(text)
968 -        self.assertEqual(named_entities, [('http://example.com/toto', None,
969 -                                           Token(word='Toto', start=6, end=10,
970 -                                                 sentence=Sentence(indice=0, start=0, end=34)))])
971 -
972 -    def test_nerdy_process_add_preprocess(self):
973 -        """ Test nerdy process """
974 -        text = 'Hello Toto, this is   me speaking. And me.'
975 -        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
976 -                                          'me': 'http://example.com/me'})
977 -        preprocessor = core.NerdyStopwordsFilterPreprocessor()
978 -        nerdy = core.NerdyProcess((source,),)
979 -        named_entities = nerdy.process_text(text)
980 -        self.assertEqual(named_entities,
981 -                         [('http://example.com/toto', None,
982 -                           Token(word='Toto', start=6, end=10,
983 -                                 sentence=Sentence(indice=0, start=0, end=34))),
984 -                          ('http://example.com/me', None,
985 -                           Token(word='me', start=22, end=24,
986 -                                 sentence=Sentence(indice=0, start=0, end=34))),
987 -                          ('http://example.com/me', None,
988 -                           Token(word='me', start=39, end=41,
989 -                                 sentence=Sentence(indice=1, start=34, end=42)))])
990 -        nerdy.add_preprocessors(preprocessor)
991 -        named_entities = nerdy.process_text(text)
992 -        self.assertEqual(named_entities, [('http://example.com/toto', None,
993 -                                           Token(word='Toto', start=6, end=10,
994 -                                                 sentence=Sentence(indice=0, start=0, end=34)))])
995 -
996 -    def test_nerdy_process_chained_word(self):
997 -        """ Test nerdy process """
998 -        text = 'Hello everyone me, this is   me speaking. And me.'
999 -        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
1000 -                                          'everyone me': 'http://example.com/everyone_me',
1001 -                                          'me': 'http://example.com/me'})
1002 -        nerdy = core.NerdyProcess((source,))
1003 -        named_entities = nerdy.process_text(text)
1004 -        self.assertEqual(named_entities,
1005 -                         [('http://example.com/everyone_me', None,
1006 -                           Token(word='everyone me', start=6, end=17,
1007 -                                 sentence=Sentence(indice=0, start=0, end=41))),
1008 -                          ('http://example.com/me', None,
1009 -                           Token(word='me', start=29, end=31,
1010 -                                 sentence=Sentence(indice=0, start=0, end=41))),
1011 -                          ('http://example.com/me', None,
1012 -                           Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
1013 -
1014 -
1015 -if __name__ == '__main__':
1016 -    unittest2.main()
1017 -
diff --git a/test/test_named_entities.py b/test/test_named_entities.py
@@ -0,0 +1,225 @@
1018 +# -*- coding:utf-8 -*-
1019 +#
1020 +# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
1021 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
1022 +#
1023 +# This program is free software: you can redistribute it and/or modify it under
1024 +# the terms of the GNU Lesser General Public License as published by the Free
1025 +# Software Foundation, either version 2.1 of the License, or (at your option)
1026 +# any later version.
1027 +#
1028 +# This program is distributed in the hope that it will be useful, but WITHOUT
1029 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1030 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1031 +# details.
1032 +#
1033 +# You should have received a copy of the GNU Lesser General Public License along
1034 +# with this program. If not, see <http://www.gnu.org/licenses/>.
1035 +import unittest2
1036 +
1037 +from nerdy import core
1038 +from nerdy.tokenizer import Token, Sentence
1039 +
1040 +
1041 +class CoreTest(unittest2.TestCase):
1042 +    """ Test of core """
1043 +
1044 +    def test_lexical_source(self):
1045 +        """ Test lexical source """
1046 +        lexicon = {'everyone': 'http://example.com/everyone',
1047 +                   'me': 'http://example.com/me'}
1048 +        source = core.NerdySourceLexical(lexicon)
1049 +        self.assertEqual(source.query_word('me'), ['http://example.com/me',])
1050 +        self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
1051 +        self.assertEqual(source.query_word('me everyone'), [])
1052 +        self.assertEqual(source.query_word('toto'), [])
1053 +        # Token
1054 +        token = Token('me', 0, 2, None)
1055 +        self.assertEqual(source.recognize_token(token), ['http://example.com/me',])
1056 +        token = Token('ma', 0, 2, None)
1057 +        self.assertEqual(source.recognize_token(token), [])
1058 +
1059 +    def test_rql_source(self):
1060 +        """ Test rql source """
1061 +        source = core.NerdySourceUrlRql('Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"',
1062 +                                       'http://www.cubicweb.org')
1063 +        self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
1064 +
1065 +    def test_sparql_source(self):
1066 +        """ Test sparql source """
1067 +        source = core.NerdySourceSparql(u'''SELECT ?uri
1068 +                                            WHERE{
1069 +                                            ?uri rdfs:label "Python"@en .
1070 +                                            ?uri rdf:type ?type}''',
1071 +                                        u'http://dbpedia.org/sparql')
1072 +        self.assertEqual(source.query_word('cubicweb'),
1073 +                         [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
1074 +                          u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
1075 +
1076 +    def test_nerdy_process(self):
1077 +        """ Test nerdy process """
1078 +        text = 'Hello everyone, this is   me speaking. And me.'
1079 +        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
1080 +                                          'me': 'http://example.com/me'})
1081 +        nerdy = core.NerdyProcess((source,))
1082 +        named_entities = nerdy.process_text(text)
1083 +        self.assertEqual(named_entities,
1084 +                         [('http://example.com/everyone', None,
1085 +                           Token(word='everyone', start=6, end=14,
1086 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1087 +                          ('http://example.com/me', None,
1088 +                           Token(word='me', start=26, end=28,
1089 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1090 +                          ('http://example.com/me', None,
1091 +                           Token(word='me', start=43, end=45,
1092 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
1093 +
1094 +    def test_nerdy_process_multisources(self):
1095 +        """ Test nerdy process """
1096 +        text = 'Hello everyone, this is   me speaking. And me.'
1097 +        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
1098 +                                          'me': 'http://example.com/me'})
1099 +        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
1100 +        # Two sources, not unique
1101 +        nerdy = core.NerdyProcess((source1, source2))
1102 +        named_entities = nerdy.process_text(text)
1103 +        self.assertEqual(named_entities,
1104 +                         [('http://example.com/everyone', None,
1105 +                           Token(word='everyone', start=6, end=14,
1106 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1107 +                          ('http://example.com/me', None,
1108 +                           Token(word='me', start=26, end=28,
1109 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1110 +                          ('http://example2.com/me', None,
1111 +                           Token(word='me', start=26, end=28,
1112 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1113 +                          ('http://example.com/me', None,
1114 +                           Token(word='me', start=43, end=45,
1115 +                                           sentence=Sentence(indice=1, start=38, end=46))),
1116 +                          ('http://example2.com/me', None,
1117 +                           Token(word='me', start=43, end=45,
1118 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
1119 +        # Two sources, unique
1120 +        nerdy = core.NerdyProcess((source1, source2), unique=True)
1121 +        named_entities = nerdy.process_text(text)
1122 +        self.assertEqual(named_entities,
1123 +                         [('http://example.com/everyone', None,
1124 +                           Token(word='everyone', start=6, end=14,
1125 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1126 +                          ('http://example.com/me', None,
1127 +                           Token(word='me', start=26, end=28,
1128 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1129 +                          ('http://example.com/me', None,
1130 +                           Token(word='me', start=43, end=45,
1131 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
1132 +        # Two sources reversed, unique
1133 +        nerdy = core.NerdyProcess((source2, source1), unique=True)
1134 +        named_entities = nerdy.process_text(text)
1135 +        self.assertEqual(named_entities,
1136 +                         [('http://example.com/everyone', None,
1137 +                           Token(word='everyone', start=6, end=14,
1138 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1139 +                          ('http://example2.com/me', None,
1140 +                           Token(word='me', start=26, end=28,
1141 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1142 +                          ('http://example2.com/me', None,
1143 +                           Token(word='me', start=43, end=45,
1144 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
1145 +
1146 +    def test_nerdy_process_add_sources(self):
1147 +        """ Test nerdy process """
1148 +        text = 'Hello everyone, this is   me speaking. And me.'
1149 +        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
1150 +                                          'me': 'http://example.com/me'})
1151 +        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
1152 +        nerdy = core.NerdyProcess((source1,))
1153 +        named_entities = nerdy.process_text(text)
1154 +        self.assertEqual(named_entities,
1155 +                         [('http://example.com/everyone', None,
1156 +                           Token(word='everyone', start=6, end=14,
1157 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1158 +                          ('http://example.com/me', None,
1159 +                           Token(word='me', start=26, end=28,
1160 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1161 +                          ('http://example.com/me', None,
1162 +                           Token(word='me', start=43, end=45,
1163 +                                           sentence=Sentence(indice=1, start=38, end=46))),])
1164 +        # Two sources, not unique
1165 +        nerdy.add_ner_source(source2)
1166 +        named_entities = nerdy.process_text(text)
1167 +        self.assertEqual(named_entities,
1168 +                         [('http://example.com/everyone', None,
1169 +                           Token(word='everyone', start=6, end=14,
1170 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1171 +                          ('http://example.com/me', None,
1172 +                           Token(word='me', start=26, end=28,
1173 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1174 +                          ('http://example2.com/me', None,
1175 +                           Token(word='me', start=26, end=28,
1176 +                                           sentence=Sentence(indice=0, start=0, end=38))),
1177 +                          ('http://example.com/me', None,
1178 +                           Token(word='me', start=43, end=45,
1179 +                                           sentence=Sentence(indice=1, start=38, end=46))),
1180 +                          ('http://example2.com/me', None,
1181 +                           Token(word='me', start=43, end=45,
1182 +                                           sentence=Sentence(indice=1, start=38, end=46)))])
1183 +
1184 +    def test_nerdy_process_preprocess(self):
1185 +        """ Test nerdy process """
1186 +        text = 'Hello Toto, this is   me speaking. And me.'
1187 +        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
1188 +                                          'me': 'http://example.com/me'})
1189 +        preprocessor = core.NerdyStopwordsFilterPreprocessor()
1190 +        nerdy = core.NerdyProcess((source,),
1191 +                                  preprocessors=(preprocessor,))
1192 +        named_entities = nerdy.process_text(text)
1193 +        self.assertEqual(named_entities, [('http://example.com/toto', None,
1194 +                                           Token(word='Toto', start=6, end=10,
1195 +                                                 sentence=Sentence(indice=0, start=0, end=34)))])
1196 +
1197 +    def test_nerdy_process_add_preprocess(self):
1198 +        """ Test nerdy process """
1199 +        text = 'Hello Toto, this is   me speaking. And me.'
1200 +        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
1201 +                                          'me': 'http://example.com/me'})
1202 +        preprocessor = core.NerdyStopwordsFilterPreprocessor()
1203 +        nerdy = core.NerdyProcess((source,),)
1204 +        named_entities = nerdy.process_text(text)
1205 +        self.assertEqual(named_entities,
1206 +                         [('http://example.com/toto', None,
1207 +                           Token(word='Toto', start=6, end=10,
1208 +                                 sentence=Sentence(indice=0, start=0, end=34))),
1209 +                          ('http://example.com/me', None,
1210 +                           Token(word='me', start=22, end=24,
1211 +                                 sentence=Sentence(indice=0, start=0, end=34))),
1212 +                          ('http://example.com/me', None,
1213 +                           Token(word='me', start=39, end=41,
1214 +                                 sentence=Sentence(indice=1, start=34, end=42)))])
1215 +        nerdy.add_preprocessors(preprocessor)
1216 +        named_entities = nerdy.process_text(text)
1217 +        self.assertEqual(named_entities, [('http://example.com/toto', None,
1218 +                                           Token(word='Toto', start=6, end=10,
1219 +                                                 sentence=Sentence(indice=0, start=0, end=34)))])
1220 +
1221 +    def test_nerdy_process_chained_word(self):
1222 +        """ Test nerdy process """
1223 +        text = 'Hello everyone me, this is   me speaking. And me.'
1224 +        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
1225 +                                          'everyone me': 'http://example.com/everyone_me',
1226 +                                          'me': 'http://example.com/me'})
1227 +        nerdy = core.NerdyProcess((source,))
1228 +        named_entities = nerdy.process_text(text)
1229 +        self.assertEqual(named_entities,
1230 +                         [('http://example.com/everyone_me', None,
1231 +                           Token(word='everyone me', start=6, end=17,
1232 +                                 sentence=Sentence(indice=0, start=0, end=41))),
1233 +                          ('http://example.com/me', None,
1234 +                           Token(word='me', start=29, end=31,
1235 +                                 sentence=Sentence(indice=0, start=0, end=41))),
1236 +                          ('http://example.com/me', None,
1237 +                           Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
1238 +
1239 +
1240 +if __name__ == '__main__':
1241 +    unittest2.main()
1242 +
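
A minimal usage sketch assembled from the classes and tests shown in this changeset. The import path is an assumption: the tests above still import from `nerdy`, while the files now live under `named_entities/`, so adjust it to whichever package name is actually installed.

# Minimal usage sketch (assumption: the package is still importable as `nerdy`,
# as the tests above do; after the rename it may be `named_entities` instead).
from nerdy import core

# A purely lexical source: a dict mapping surface forms to URIs.
lexicon = {'everyone': 'http://example.com/everyone',
           'me': 'http://example.com/me'}
source = core.NerdySourceLexical(lexicon)

# Drop tokens made only of stopwords before querying the sources.
preprocessor = core.NerdyStopwordsFilterPreprocessor(lang='en')

# The process ties sources, preprocessors and filters together.
nerdy = core.NerdyProcess((source,), preprocessors=(preprocessor,))

# Each result is a (uri, source_name, token) tuple, as in the tests above.
for uri, source_name, token in nerdy.process_text('Hello everyone, this is me speaking.'):
    print uri, token.word, token.start, token.end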