[pkginfo] Rename ner to named_entities and update pkginfo, related to #187461

author: vincent.michel@logilab.fr
changeset: 36951167576c
branch: default
phase: draft
hidden: yes
parent revision: #b323882735ec [utils] Create an utils folder, related to #187461
child revision: #00d352769ba0 [ner] Cleanup Nerdy, related to #187461
files modified by this revision
__pkginfo__.py
named_entities/__init__.py
named_entities/core.py
named_entities/tokenizer.py
ner/__init__.py
ner/core.py
ner/tokenizer.py
# HG changeset patch
# User vincent.michel@logilab.fr
# Date 1387464304 0
# Thu Dec 19 14:45:04 2013 +0000
# Node ID 36951167576cdf576b711e1005726ac054611299
# Parent b323882735ecefe2f96381e3500fd3c91eef217d
[pkginfo] Rename ner to named_entities and update pkginfo, related to #187461

diff --git a/__pkginfo__.py b/__pkginfo__.py
@@ -33,9 +33,9 @@
1  author_email = "contact@logilab.fr"
2 
3 
4  from os.path import join
5  scripts = [join('bin', 'pytest')]
6 -include_dirs = [join('test', 'data')]
7 +include_dirs = [join('test', 'data'), 'utils', 'named_entities', 'record_linkage', 'reference_data', 'examples']
8 
9  if sys.version_info < (2, 7):
10      install_requires = ['unittest2 >= 0.5.1']
diff --git a/named_entities/__init__.py b/named_entities/__init__.py
diff --git a/named_entities/core.py b/named_entities/core.py
@@ -0,0 +1,396 @@
11 +# -*- coding: utf-8 -*-
12 +""" Core functions for Named Entities Recognition.
13 +"""
14 +from nerdy.tokenizer import RichStringTokenizer, Token
15 +from nerdy.dataio import sparql_query, rql_url_query, rql_appid_query
16 +from nerdy.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
17 +
18 +STOPWORDS = {'fr': FRENCH_STOPWORDS,
19 +             'en': ENGLISH_STOPWORDS}
20 +
21 +# XXX Add SQL source?
22 +# XXX NER preprocessor
23 +
24 +###############################################################################
25 +### NER SOURCE ################################################################
26 +###############################################################################
27 +class AbstractNerdySource(object):
28 +    """ High-level source for Named Entities Recognition
29 +    """
30 +
31 +    def __init__(self, query, endpoint, name=None, use_cache=True, preprocessors=None):
32 +        """ Initialise the class.
33 +        """
34 +        self.query = query
35 +        self.endpoint = endpoint
36 +        self.name = name
37 +        self.preprocessors = preprocessors or []
38 +        self.use_cache = use_cache
39 +        self._recognized_cache = {}
40 +
41 +    def add_preprocessors(self, preprocessor):
42 +        """ Add a preprocessor
43 +        """
44 +        self.preprocessors.append(preprocessor)
45 +
46 +    def recognize_token(self, token):
47 +        """ Recognize a token
48 +        """
49 +        # Apply source-specific preprocessors
50 +        for preprocessor in self.preprocessors:
51 +            token = preprocessor(token)
52 +            if not token:
53 +                return []
54 +        if self.use_cache and token.word in self._recognized_cache:
55 +            return self._recognized_cache[token.word]
56 +        uris = self.query_word(token.word) if token.word else []
57 +        if self.use_cache:
58 +            self._recognized_cache[token.word] = uris
59 +        return uris
60 +
61 +    def query_word(self, word):
62 +        """ Query a word for a Named Entities Recognition process
63 +        """
64 +        raise NotImplementedError
65 +
66 +
67 +class NerdySourceLexical(AbstractNerdySource):
68 +    """ Source based on a (pre-computed) dictionnary of words (token, uri)
69 +    """
70 +    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
71 +        self.lexicon = lexicon
72 +        self.name = name
73 +        self.preprocessors = preprocessors or []
74 +        self.use_cache = use_cache
75 +        self._recognized_cache = {}
76 +
77 +    def query_word(self, word):
78 +        uri = self.lexicon.get(word)
79 +        return [uri,] if uri else []
80 +
81 +
82 +class NerdySourceLocalRql(AbstractNerdySource):
83 +    """ High-level source for Named Entities Recognition
84 +    Local RQL version
85 +    """
86 +
87 +    def __init__(self, query, session, name=None, use_cache=True, preprocessors=None):
88 +        """ Initialise the class.
89 +        """
90 +        self.query = query
91 +        self.session = session
92 +        self.name = name
93 +        self.preprocessors = preprocessors or []
94 +        self.use_cache = use_cache
95 +        self._recognized_cache = {}
96 +
97 +    def query_word(self, word):
98 +        """ Query a word for a Named Entities Recognition process
99 +        """
100 +        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
101 +
102 +
103 +class NerdySourceAppidRql(AbstractNerdySource):
104 +    """ High-level source for Named Entities Recognition
105 +    Appid RQL version
106 +    """
107 +
108 +    def query_word(self, word):
109 +        """ Query a word for a Named Entities Recognition process
110 +        """
111 +        return [r[0] for r in rql_appid_query(self.query, self.endpoint, word=word)]
112 +
113 +
114 +class NerdySourceUrlRql(AbstractNerdySource):
115 +    """ High-level source for Named Entities Recognition
116 +    Url RQL version
117 +    """
118 +
119 +    def query_word(self, word):
120 +        """ Query a word for a Named Entities Recognition process
121 +        """
122 +        return [r[0] for r in rql_url_query(self.query % {'word': word}, self.endpoint)]
123 +
124 +
125 +class NerdySourceSparql(AbstractNerdySource):
126 +    """ High-level source for Named Entities Recognition
127 +    SPARQL version
128 +
129 +    >>> from nerdy.core import NerdySourceSparql
130 +    >>> ner_source = NerdySourceSparql('''SELECT ?uri
131 +    ...                                   WHERE {
132 +    ...                                   ?uri rdfs:label "%(word)s"@en}''',
133 +    ...                                'http://dbpedia.org/sparql')
134 +    >>> print ner_source.query_word('Victor Hugo')
135 +    ['http://dbpedia.org/resource/Category:Victor_Hugo',
136 +     'http://dbpedia.org/resource/Victor_Hugo',
137 +     'http://dbpedia.org/class/yago/VictorHugo',
138 +     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
139 +     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
140 +     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
141 +
142 +    """
143 +
144 +    def query_word(self, word):
145 +        """ Query a word for a Named Entities Recognition process
146 +        """
147 +        return [r['uri']['value'] for r in sparql_query(self.query % {'word': word}, self.endpoint)]
148 +
149 +
150 +###############################################################################
151 +### NER PREPROCESSORS #########################################################
152 +###############################################################################
153 +class AbstractNerdyPreprocessor(object):
154 +    """ Preprocessor
155 +    """
156 +
157 +    def __call__(self, token):
158 +        raise NotImplementedError
159 +
160 +
161 +class NerdyWordSizeFilterPreprocessor(AbstractNerdyPreprocessor):
162 +    """ Remove token based on the size of the word
163 +    """
164 +    def __init__(self, min_size=None, max_size=None):
165 +        self.min_size = min_size
166 +        self.max_size = max_size
167 +
168 +    def __call__(self, token):
169 +        if ((self.min_size and len(token.word)<self.min_size)
170 +            or (self.max_size and len(token.word)>self.max_size)):
171 +            return None
172 +        return token
173 +
174 +
175 +class NerdyLowerCaseFilterPreprocessor(AbstractNerdyPreprocessor):
176 +    """ Remove token with word in lower case
177 +    """
178 +
179 +    def __call__(self, token):
180 +        return None if token.word.islower() else token
181 +
182 +
183 +class NerdyLowerFirstWordPreprocessor(AbstractNerdyPreprocessor):
184 +    """ Lower the first word of each sentence if it is a stopword.
185 +    """
186 +    def __init__(self, lang='en'):
187 +        self.lang = lang
188 +
189 +    def __call__(self, token):
190 +        if (token.start == token.sentence.start and
191 +            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
192 +            word = token.word[0].lower() + token.word[1:]
193 +            return Token(word, token.start, token.end, token.sentence)
194 +        return token
195 +
196 +
197 +class NerdyStopwordsFilterPreprocessor(AbstractNerdyPreprocessor):
198 +    """ Remove stopwords
199 +    """
200 +    def __init__(self, split_words=False, lang='en'):
201 +        self.split_words = split_words
202 +        self.lang = lang
203 +
204 +    def __call__(self, token):
205 +        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
206 +        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
207 +            return None
208 +        if not self.split_words and token.word.lower() in stopwords:
209 +            return None
210 +        return token
211 +
212 +
213 +class NerdyHashTagPreprocessor(AbstractNerdyPreprocessor):
214 +    """ Cleanup hashtag
215 +    """
216 +    def __call__(self, token):
217 +        if token.word.startswith('@'):
218 +            # XXX Split on capital letters?
219 +            # @BarackObama -> Barack Obama
220 +            word = token.word[1:].replace('_', ' ')
221 +            return Token(word, token.start, token.end, token.sentence)
222 +        return token
223 +
224 +
225 +###############################################################################
226 +### NER FILTERS ###############################################################
227 +###############################################################################
228 +class AbstractNerdyFilter(object):
229 +    """ A filter used for cleaning named entities results
230 +    """
231 +
232 +    def __call__(self, named_entities):
233 +        raise NotImplementedError
234 +
235 +
236 +class NerdyOccurenceFilter(object):
237 +    """ A filter based on the number of occurence of
238 +    named entities in the results.
239 +    """
240 +    def __init__(self, min_occ=None, max_occ=None):
241 +        self.min_occ = min_occ
242 +        self.max_occ = max_occ
243 +
244 +    def __call__(self, named_entities):
245 +        uris = [u for u, p, t in named_entities]
246 +        counts = dict([(u, uris.count(u)) for u in set(uris)])
247 +        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
248 +                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
249 +
250 +
251 +class NerdyRDFTypeFilter(object):
252 +    """ A filter based on the RDF type on entity
253 +    E.g.
254 +
255 +    filter = NerdyRDFTypeFilter('http://dbpedia.org/sparql',
256 +                                ('http://schema.org/Place',
257 +                                'http://dbpedia.org/ontology/Agent',
258 +                                'http://dbpedia.org/ontology/Place'))
259 +
260 +    """
261 +    def __init__(self, endpoint, accepted_types):
262 +        self.endpoint = endpoint
263 +        self.accepted_types = accepted_types
264 +        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
265 +
266 +    def __call__(self, named_entities):
267 +        filtered_named_entities = []
268 +        seen_uris = {}
269 +        for uri, p, t in named_entities:
270 +            if uri in seen_uris:
271 +                if seen_uris[uri]:
272 +                    filtered_named_entities.append((uri, p, t))
273 +            else:
274 +                results = sparql_query(self.query % {'uri': uri}, self.endpoint)
275 +                types = set([r['type']['value'] for r in results])
276 +                if not len(types.intersection(self.accepted_types)):
277 +                    seen_uris[uri] = False
278 +                else:
279 +                    seen_uris[uri] = True
280 +                    filtered_named_entities.append((uri, p, t))
281 +        return filtered_named_entities
282 +
283 +
284 +class NerdyDisambiguationWordParts(object):
285 +    """ Disambiguate named entities based on the words parts.
286 +    E.g.:
287 +          'toto tutu': 'http://example.com/toto_tutu',
288 +          'toto': 'http://example.com/toto'
289 +
290 +          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
291 +          by 'http://example.com/toto_tutu'
292 +    """
293 +    def __call__(self, named_entities):
294 +        # Build the word parts dictionary
295 +        parts = {}
296 +        for uri, peid, token in named_entities:
297 +            if ' ' in token.word:
298 +                for part in token.word.split(' '):
299 +                    parts[part.lower()] = uri
300 +        # Replace named entities
301 +        filtered_named_entities = []
302 +        for uri, peid, token in named_entities:
303 +            if token.word in parts:
304 +                # Change URI
305 +                uri = parts[token.word]
306 +            filtered_named_entities.append((uri, peid, token))
307 +        return filtered_named_entities
308 +
309 +
310 +class NerdyReplacementRulesFilter(object):
311 +    """ Allow to define replacement rules for Named Entities
312 +    """
313 +    def __init__(self,rules):
314 +        self.rules = rules
315 +
316 +    def __call__(self, named_entities):
317 +        filtered_named_entities = []
318 +        for uri, peid, token in named_entities:
319 +            uri = self.rules.get(uri, uri)
320 +            filtered_named_entities.append((uri, peid, token))
321 +        return filtered_named_entities
322 +
323 +
324 +###############################################################################
325 +### NER PROCESS ###############################################################
326 +###############################################################################
327 +class NerdyProcess(object):
328 +    """ High-level process for Named Entities Recognition
329 +    """
330 +
331 +    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
332 +        """ Initialise the class.
333 +
334 +        :ner_sources: an iterable of NER sources
335 +        """
336 +        self.ner_sources = list(ner_sources)
337 +        self.preprocessors = preprocessors or []
338 +        self.filters = filters or []
339 +        self.unique = unique
340 +
341 +    def add_ner_source(self, process):
342 +        """ Add a ner process
343 +        """
344 +        self.ner_sources.append(process)
345 +
346 +    def add_preprocessors(self, preprocessor):
347 +        """ Add a preprocessor
348 +        """
349 +        self.preprocessors.append(preprocessor)
350 +
351 +    def add_filters(self, filter):
352 +        """ Add a filter
353 +        """
354 +        self.filters.append(filter)
355 +
356 +    def process_text(self, text):
357 +        """ High level function for analyzing a text
358 +        """
359 +        tokenizer = RichStringTokenizer(text)
360 +        return self.recognize_tokens(tokenizer)
361 +
362 +    def recognize_tokens(self, tokens):
363 +        """ Recognize Named Entities from a tokenizer or
364 +        an iterator yielding tokens.
365 +        """
366 +        last_stop = 0
367 +        named_entities = []
368 +        for token in tokens:
369 +            if token.start < last_stop:
370 +                continue # this token overlaps with a previous match
371 +            word = token.word
372 +            # Apply preprocessors
373 +            # XXX Preprocessors may be source dependent
374 +            for preprocessor in self.preprocessors:
375 +                token = preprocessor(token)
376 +                if not token:
377 +                    break
378 +            if not token:
379 +                continue
380 +            recognized = False
381 +            for process in self.ner_sources:
382 +                for uri in process.recognize_token(token):
383 +                    named_entities.append((uri, process.name, token))
384 +                    recognized = True
385 +                    last_stop = token.end
386 +                    if self.unique:
387 +                        break
388 +                if recognized and self.unique:
389 +                    break
390 +        # XXX Postprocessing/filters may be source dependent
391 +        return self.postprocess(named_entities)
392 +
393 +    def postprocess(self, named_entities):
394 +        """ Postprocess the results by applying filters """
395 +        for filter in self.filters:
396 +            named_entities = filter(named_entities)
397 +        return named_entities
398 +
399 +
400 +###############################################################################
401 +### NER RELATIONS PROCESS #####################################################
402 +###############################################################################
403 +class NerdyRelationsProcess(object):
404 +    """ Process for building simple relation from named entities results
405 +    """
406 +    pass
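
The following is a minimal usage sketch of the module added above, wiring a lexical source and a stopwords preprocessor into a NerdyProcess. It is illustrative only: the lexicon and URIs are made up, and it assumes the classes remain importable under the nerdy package, as the imports in the moved file above still are.

    from nerdy.core import (NerdySourceLexical, NerdyProcess,
                            NerdyStopwordsFilterPreprocessor)

    # Hand-built lexicon mapping surface forms to (hypothetical) URIs
    source = NerdySourceLexical({'Victor Hugo': 'http://example.org/victor_hugo',
                                 'Paris': 'http://example.org/paris'},
                                name='lexicon')
    process = NerdyProcess([source],
                           preprocessors=[NerdyStopwordsFilterPreprocessor(lang='en')],
                           unique=True)
    # process_text() tokenizes the text and returns (uri, source_name, Token) tuples
    for uri, source_name, token in process.process_text(u'Victor Hugo was born in Paris.'):
        print uri, token.word
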
diff --git a/named_entities/tokenizer.py b/named_entities/tokenizer.py
@@ -0,0 +1,66 @@
407 +# -*- coding: utf-8 -*-
408 +""" Tokenizer for sentences/words segmentation.
409 +"""
410 +import itertools
411 +import collections
412 +import re
413 +
414 +
415 +Token = collections.namedtuple('Token', ['word', 'start', 'end', 'sentence'])
416 +Sentence = collections.namedtuple('Sentence', ['indice', 'start', 'end'])
417 +
418 +
419 +class RichStringTokenizer(object):
420 +    """Tokenizer for Yams' RichString content.
421 +
422 +    The tokenizer uses a variable-length sliding window, i.e. a sliding
423 +    window yielding tokens of N words.
424 +    """
425 +
426 +    def __init__(self, text, token_min_size=1, token_max_size=3):
427 +        """
428 +        :token_min_size: minimum number of words required to be a valid token
429 +        :token_max_size: maximum number of words allowed in a token
430 +        """
431 +        self.text = text
432 +        self.token_min_size = token_min_size
433 +        self.token_max_size = token_max_size
434 +
435 +    def iter_tokens(self, text):
436 +        """ Iterate tokens over a text
437 +        """
438 +        # Compute sentences
439 +        sentences = self.find_sentences(text)
440 +        # Compute words
441 +        words = list(re.finditer(r'[\w@-]+', text, re.UNICODE))
442 +        indice = 0
443 +        while indice < len(words):
444 +            # Find the sentence containing the first word of the window
445 +            current_sentence = [s for s in sentences if s.start<=words[indice].start()][-1]
446 +            # Slide a window of decreasing length over the words of the current sentence
447 +            remaining = len(words) - indice
448 +            for length in range(min(self.token_max_size, remaining), self.token_min_size-1, -1):
449 +                _words = words[indice:indice+length]
450 +                if _words[-1].start() > current_sentence.end:
451 +                    # The last word is not in the same sentence anymore, skip this window
452 +                    continue
453 +                normalized_word = ' '.join([w.group() for w in _words]).strip()
454 +                yield Token(normalized_word, _words[0].start(), _words[-1].end(), current_sentence)
455 +            indice += 1
456 +
457 +    def find_sentences(self, text):
458 +        """ Find the sentences
459 +        """
460 +        return [Sentence(ind, s.start(), s.end()) for ind, s in
461 +                enumerate(re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE))]
462 +
463 +    def load_text(self, text):
464 +        """ Load the text to be tokenized
465 +        """
466 +        self.text = text
467 +
468 +    def __iter__(self):
469 +        """ Iterator over the text given in the object instantiation
470 +        """
471 +        for t in self.iter_tokens(self.text):
472 +            yield t
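
A short sketch of the tokenizer on its own, for reference (the sample sentence is made up; start/end are character offsets in the input text):

    from nerdy.tokenizer import RichStringTokenizer

    tokenizer = RichStringTokenizer(u'Victor Hugo was born in Paris.',
                                    token_min_size=1, token_max_size=2)
    for token in tokenizer:
        # Each Token carries the normalized word(s), its offsets and its Sentence
        print token.word, token.start, token.end
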
diff --git a/ner/__init__.py b/ner/__init__.py
diff --git a/ner/core.py b/ner/core.py
@@ -1,396 +0,0 @@
473 -# -*- coding: utf-8 -*-
474 -""" Core functions for Named Entities Recognition.
475 -"""
476 -from nerdy.tokenizer import RichStringTokenizer, Token
477 -from nerdy.dataio import sparql_query, rql_url_query, rql_appid_query
478 -from nerdy.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
479 -
480 -STOPWORDS = {'fr': FRENCH_STOPWORDS,
481 -             'en': ENGLISH_STOPWORDS}
482 -
483 -# XXX Add SQL source ?
484 -# XXX NER preprocessor
485 -
486 -###############################################################################
487 -### NER SOURCE ################################################################
488 -###############################################################################
489 -class AbstractNerdySource(object):
490 -    """ High-level source for Named Entities Recognition
491 -    """
492 -
493 -    def __init__(self, query, endpoint, name=None, use_cache=True, preprocessors=None):
494 -        """ Initialise the class.
495 -        """
496 -        self.query = query
497 -        self.endpoint = endpoint
498 -        self.name = name
499 -        self.preprocessors = preprocessors or []
500 -        self.use_cache = use_cache
501 -        self._recognized_cache = {}
502 -
503 -    def add_preprocessors(self, preprocessor):
504 -        """ Add a preprocessor
505 -        """
506 -        self.preprocessors.append(preprocessor)
507 -
508 -    def recognize_token(self, token):
509 -        """ Recognize a token
510 -        """
511 -        # Applies source specific preprocessors
512 -        for preprocessor in self.preprocessors:
513 -            token = preprocessor(token)
514 -            if not token:
515 -                return []
516 -        if self.use_cache and token.word in self._recognized_cache:
517 -            return self._recognized_cache[token.word]
518 -        uris = self.query_word(token.word) if token.word else []
519 -        if self.use_cache:
520 -            self._recognized_cache[token.word] = uris
521 -        return uris
522 -
523 -    def query_word(self, word):
524 -        """ Query a word for a Named Entities Recognition process
525 -        """
526 -        raise NotImplementedError
527 -
528 -
529 -class NerdySourceLexical(AbstractNerdySource):
530 -    """ Source based on a (pre-computed) dictionnary of words (token, uri)
531 -    """
532 -    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
533 -        self.lexicon = lexicon
534 -        self.name = name
535 -        self.preprocessors = preprocessors or []
536 -        self.use_cache = use_cache
537 -        self._recognized_cache = {}
538 -
539 -    def query_word(self, word):
540 -        uri = self.lexicon.get(word)
541 -        return [uri,] if uri else []
542 -
543 -
544 -class NerdySourceLocalRql(AbstractNerdySource):
545 -    """ High-level source for Named Entities Recognition
546 -    Local RQL version
547 -    """
548 -
549 -    def __init__(self, query, session, name=None, use_cache=True, preprocessors=None):
550 -        """ Initialise the class.
551 -        """
552 -        self.query = query
553 -        self.session = session
554 -        self.name = name
555 -        self.preprocessors = preprocessors or []
556 -        self.use_cache = use_cache
557 -        self._recognized_cache = {}
558 -
559 -    def query_word(self, word):
560 -        """ Query a word for a Named Entities Recognition process
561 -        """
562 -        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
563 -
564 -
565 -class NerdySourceAppidRql(AbstractNerdySource):
566 -    """ High-level source for Named Entities Recognition
567 -    Appid RQL version
568 -    """
569 -
570 -    def query_word(self, word):
571 -        """ Query a word for a Named Entities Recognition process
572 -        """
573 -        return [r[0] for r in rql_appid_query(self.query, self.endpoint, word=word)]
574 -
575 -
576 -class NerdySourceUrlRql(AbstractNerdySource):
577 -    """ High-level source for Named Entities Recognition
578 -    Url RQL version
579 -    """
580 -
581 -    def query_word(self, word):
582 -        """ Query a word for a Named Entities Recognition process
583 -        """
584 -        return [r[0] for r in rql_url_query(self.query % {'word': word}, self.endpoint)]
585 -
586 -
587 -class NerdySourceSparql(AbstractNerdySource):
588 -    """ High-level source for Named Entities Recognition
589 -    SPARQL version
590 -
591 -   >>> from nerdy.core import NerdySourceSparql
592 -   >>> ner_source = NerdySourceSparql('''SELECT ?uri
593 -                                         WHERE{
594 -                                         ?uri rdfs:label "%(word)s"@en}''',
595 -			                 'http://dbpedia.org/sparql')
596 -   >>> print ner_source.recognize_token('Victor Hugo')
597 -		... ['http://dbpedia.org/resource/Category:Victor_Hugo',
598 -		     'http://dbpedia.org/resource/Victor_Hugo',
599 -		     'http://dbpedia.org/class/yago/VictorHugo',
600 -		     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
601 -		     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
602 -		     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
603 -
604 -    """
605 -
606 -    def query_word(self, word):
607 -        """ Query a word for a Named Entities Recognition process
608 -        """
609 -        return [r['uri']['value'] for r in sparql_query(self.query % {'word': word}, self.endpoint)]
610 -
611 -
612 -###############################################################################
613 -### NER PREPROCESSORS #########################################################
614 -###############################################################################
615 -class AbstractNerdyPreprocessor(object):
616 -    """ Preprocessor
617 -    """
618 -
619 -    def __call__(self, token):
620 -        raise NotImplementedError
621 -
622 -
623 -class NerdyWordSizeFilterPreprocessor(AbstractNerdyPreprocessor):
624 -    """ Remove token based on the size of the word
625 -    """
626 -    def __init__(self, min_size=None, max_size=None):
627 -        self.min_size = min_size
628 -        self.max_size = max_size
629 -
630 -    def __call__(self, token):
631 -        if ((self.min_size and len(token.word)<self.min_size)
632 -            or (self.max_size and len(token.word)>self.max_size)):
633 -            return None
634 -        return token
635 -
636 -
637 -class NerdyLowerCaseFilterPreprocessor(AbstractNerdyPreprocessor):
638 -    """ Remove token with word in lower case
639 -    """
640 -
641 -    def __call__(self, token):
642 -        return None if token.word.islower() else token
643 -
644 -
645 -class NerdyLowerFirstWordPreprocessor(AbstractNerdyPreprocessor):
646 -    """ Lower the first word of each sentence if it is a stopword.
647 -    """
648 -    def __init__(self, lang='en'):
649 -        self.lang = lang
650 -
651 -    def __call__(self, token):
652 -        if (token.start == token.sentence.start and
653 -            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
654 -            word = token.word[0].lower() + token.word[1:]
655 -            return Token(word, token.start, token.end, token.sentence)
656 -        return token
657 -
658 -
659 -class NerdyStopwordsFilterPreprocessor(AbstractNerdyPreprocessor):
660 -    """ Remove stopwords
661 -    """
662 -    def __init__(self, split_words=False, lang='en'):
663 -        self.split_words = split_words
664 -        self.lang = lang
665 -
666 -    def __call__(self, token):
667 -        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
668 -        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
669 -            return None
670 -        if not self.split_words and token.word.lower() in stopwords:
671 -            return None
672 -        return token
673 -
674 -
675 -class NerdyHashTagPreprocessor(AbstractNerdyPreprocessor):
676 -    """ Cleanup hashtag
677 -    """
678 -    def __call__(self, token):
679 -        if token.word.startswith('@'):
680 -            # XXX Split capitalize letter ?
681 -            # @BarackObama -> Barack Obama
682 -            word = token.word[1:].replace('_', ' ')
683 -            return Token(word, token.start, token.end, token.sentence)
684 -        return token
685 -
686 -
687 -###############################################################################
688 -### NER FILTERS ###############################################################
689 -###############################################################################
690 -class AbstractNerdyFilter(object):
691 -    """ A filter used for cleaning named entities results
692 -    """
693 -
694 -    def __call__(self, named_entities):
695 -        raise NotImplementedError
696 -
697 -
698 -class NerdyOccurenceFilter(object):
699 -    """ A filter based on the number of occurence of
700 -    named entities in the results.
701 -    """
702 -    def __init__(self, min_occ=None, max_occ=None):
703 -        self.min_occ = min_occ
704 -        self.max_occ = max_occ
705 -
706 -    def __call__(self, named_entities):
707 -        uris = [u for u, p, t in named_entities]
708 -        counts = dict([(u, uris.count(u)) for u in set(uris)])
709 -        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
710 -                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
711 -
712 -
713 -class NerdyRDFTypeFilter(object):
714 -    """ A filter based on the RDF type on entity
715 -    E.g.
716 -
717 -    filter = NerdyRDFTypeFilter('http://dbpedia.org/sparql',
718 -                                ('http://schema.org/Place',
719 -                                'http://dbpedia.org/ontology/Agent',
720 -                                'http://dbpedia.org/ontology/Place'))
721 -
722 -    """
723 -    def __init__(self, endpoint, accepted_types):
724 -        self.endpoint = endpoint
725 -        self.accepted_types = accepted_types
726 -        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
727 -
728 -    def __call__(self, named_entities):
729 -        filtered_named_entities = []
730 -        seen_uris = {}
731 -        for uri, p, t in named_entities:
732 -            if uri in seen_uris:
733 -                if seen_uris[uri]:
734 -                    filtered_named_entities.append((uri, p, t))
735 -            else:
736 -                results = sparql_query(self.query % {'uri': uri}, self.endpoint)
737 -                types = set([r['type']['value'] for r in results])
738 -                if not len(types.intersection(self.accepted_types)):
739 -                    seen_uris[uri] = False
740 -                else:
741 -                    seen_uris[uri] = True
742 -                    filtered_named_entities.append((uri, p, t))
743 -        return filtered_named_entities
744 -
745 -
746 -class NerdyDisambiguationWordParts(object):
747 -    """ Disambiguate named entities based on the words parts.
748 -    E.g.:
749 -          'toto tutu': 'http://example.com/toto_tutu',
750 -          'toto': 'http://example.com/toto'
751 -
752 -          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
753 -          by 'http://example.com/toto_tutu'
754 -    """
755 -    def __call__(self, named_entities):
756 -        # Create parts dictionnary
757 -        parts = {}
758 -        for uri, peid, token in named_entities:
759 -            if ' ' in token.word:
760 -                for part in token.word.split(' '):
761 -                    parts[part.lower()] = uri
762 -        # Replace named entities
763 -        filtered_named_entities = []
764 -        for uri, peid, token in named_entities:
765 -            if token.word in parts:
766 -                # Change URI
767 -                uri = parts[token.word]
768 -            filtered_named_entities.append((uri, peid, token))
769 -        return filtered_named_entities
770 -
771 -
772 -class NerdyReplacementRulesFilter(object):
773 -    """ Allow to define replacement rules for Named Entities
774 -    """
775 -    def __init__(self,rules):
776 -        self.rules = rules
777 -
778 -    def __call__(self, named_entities):
779 -        filtered_named_entities = []
780 -        for uri, peid, token in named_entities:
781 -            uri = self.rules.get(uri, uri)
782 -            filtered_named_entities.append((uri, peid, token))
783 -        return filtered_named_entities
784 -
785 -
786 -###############################################################################
787 -### NER PROCESS ###############################################################
788 -###############################################################################
789 -class NerdyProcess(object):
790 -    """ High-level process for Named Entities Recognition
791 -    """
792 -
793 -    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
794 -        """ Initialise the class.
795 -
796 -        :tokenizer: an instance of tokenizer
797 -        """
798 -        self.ner_sources = list(ner_sources)
799 -        self.preprocessors = preprocessors or []
800 -        self.filters = filters or []
801 -        self.unique = unique
802 -
803 -    def add_ner_source(self, process):
804 -        """ Add a ner process
805 -        """
806 -        self.ner_sources.append(process)
807 -
808 -    def add_preprocessors(self, preprocessor):
809 -        """ Add a preprocessor
810 -        """
811 -        self.preprocessors.append(preprocessor)
812 -
813 -    def add_filters(self, filter):
814 -        """ Add a filter
815 -        """
816 -        self.filters.append(filter)
817 -
818 -    def process_text(self, text):
819 -        """ High level function for analyzing a text
820 -        """
821 -        tokenizer = RichStringTokenizer(text)
822 -        return self.recognize_tokens(tokenizer)
823 -
824 -    def recognize_tokens(self, tokens):
825 -        """ Recognize Named Entities from a tokenizer or
826 -        an iterator yielding tokens.
827 -        """
828 -        last_stop = 0
829 -        named_entities = []
830 -        for token in tokens:
831 -            if token.start < last_stop:
832 -                continue # this token overlaps with a previous match
833 -            word = token.word
834 -            # Applies preprocessors
835 -            # XXX Preprocessors may be sources dependant
836 -            for preprocessor in self.preprocessors:
837 -                token = preprocessor(token)
838 -                if not token:
839 -                    break
840 -            if not token:
841 -                continue
842 -            recognized = False
843 -            for process in self.ner_sources:
844 -                for uri in process.recognize_token(token):
845 -                    named_entities.append((uri, process.name, token))
846 -                    recognized = True
847 -                    last_stop = token.end
848 -                    if self.unique:
849 -                        break
850 -                if recognized and self.unique:
851 -                    break
852 -        # XXX Postprocess/filters may be sources dependant
853 -        return self.postprocess(named_entities)
854 -
855 -    def postprocess(self, named_entities):
856 -        """ Postprocess the results by applying filters """
857 -        for filter in self.filters:
858 -            named_entities = filter(named_entities)
859 -        return named_entities
860 -
861 -
862 -###############################################################################
863 -### NER RELATIONS PROCESS #####################################################
864 -###############################################################################
865 -class NerdyRelationsProcess(object):
866 -    """ Process for building simple relation from named entities results
867 -    """
868 -    pass
diff --git a/ner/tokenizer.py b/ner/tokenizer.py
@@ -1,66 +0,0 @@
869 -# -*- coding: utf-8 -*-
870 -""" Tokenizer for sentences/words segmentation.
871 -"""
872 -import itertools
873 -import collections
874 -import re
875 -
876 -
877 -Token = collections.namedtuple('Token', ['word', 'start', 'end', 'sentence'])
878 -Sentence = collections.namedtuple('Sentence', ['indice', 'start', 'end'])
879 -
880 -
881 -class RichStringTokenizer(object):
882 -    """Tokenizer for Yams' RichString content.
883 -
884 -    The tokenizer uses a variable-length sliding window, i.e. a sliding
885 -    window yielding tokens of N words.
886 -    """
887 -
888 -    def __init__(self, text, token_min_size=1, token_max_size=3):
889 -        """
890 -        :token_min_size: minimum number of words required to be a valid token
891 -        :token_max_size: minimum number of words required to be a valid token
892 -        """
893 -        self.text = text
894 -        self.token_min_size = token_min_size
895 -        self.token_max_size = token_max_size
896 -
897 -    def iter_tokens(self, text):
898 -        """ Iterate tokens over a text
899 -        """
900 -        # Compute sentences
901 -        sentences = self.find_sentences(text)
902 -        # Compute words
903 -        words = list([m for m in re.finditer(r'[\w@-]+', text, re.UNICODE)])
904 -        indice = 0
905 -        while indice < len(words):
906 -            # Choose the current sentence of the first word
907 -            current_sentence = [s for s in sentences if s.start<=words[indice].start()][-1]
908 -            # Sliding windows over the different words for each sentence
909 -            remaining = len(words) - indice
910 -            for length in range(min(self.token_max_size, remaining), self.token_min_size-1, -1):
911 -                _words = words[indice:indice+length]
912 -                if _words[-1].start() > current_sentence.end:
913 -                    # The last word in not in the same sentence anymore, split
914 -                    continue
915 -                normalized_word = ' '.join([w.group() for w in _words]).strip()
916 -                yield Token(normalized_word, _words[0].start(), _words[-1].end(), current_sentence)
917 -            indice += 1
918 -
919 -    def find_sentences(self, text):
920 -        """ Find the sentences
921 -        """
922 -        return [Sentence(ind, s.start(), s.end()) for ind, s in
923 -                enumerate(re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE))]
924 -
925 -    def load_text(self, text):
926 -        """ Load the text to be tokenized
927 -        """
928 -        self.text = text
929 -
930 -    def __iter__(self):
931 -        """ Iterator over the text given in the object instantiation
932 -        """
933 -        for t in self.iter_tokens(self.text):
934 -            yield t