[named entities] Move tokenizer to utils and create a sources module for named entities, related to #187461

author: vincent.michel@logilab.fr
changeset: cc142a884361
branch: default
phase: draft
hidden: yes
parent revision: #ec7d7ce1ca35 [dataio] Merge dataio and tests, related to #187461
child revision: #6a0b643b9e78 [named entities] Split core into preprocessors and filters modules, related to #187461
files modified by this revision
named_entities/named_entities.py
named_entities/sources.py
named_entities/tokenizer.py
test/test_named_entities.py
test/test_tokenizer.py
utils/dataio.py
utils/tokenizer.py
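
In short, this changeset moves the tokenizer into utils/ and splits the NER sources out of named_entities.py into a dedicated sources module, renaming the nerdy-era classes along the way. A minimal sketch of how downstream imports change (old paths taken from the removed lines below, new paths from the added ones):

# Old layout (removed by this changeset):
#   from nerdy.tokenizer import RichStringTokenizer, Token
#   from nerdy.core import NerdySourceLexical, NerdySourceSparql
# New layout (added by this changeset):
from nazca.utils.tokenizer import RichStringTokenizer, Token
from nazca.named_entities.sources import NerSourceLexicon, NerSourceSparql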
# HG changeset patch
# User vincent.michel@logilab.fr
# Date 1387464326 0
# Thu Dec 19 14:45:26 2013 +0000
# Node ID cc142a884361048f68ddaf6a0c3ce76a6bd430ee
# Parent ec7d7ce1ca35a577deaa8f516b17a5d29f3b11f3
[named entities] Move tokenizer to utils and create a sources module for named entities, related to #187461

diff --git a/named_entities/named_entities.py b/named_entities/named_entities.py
@@ -1,143 +1,15 @@
1  # -*- coding: utf-8 -*-
2  """ Core functions for Named Entities Recognition.
3  """
4 -from nerdy.tokenizer import RichStringTokenizer, Token
5 -from nerdy.dataio import sparql_query, rql_url_query, rql_appid_query
6 -from nerdy.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
7 +from nazca.utils.tokenizer import RichStringTokenizer, Token
8 +from nazca.utils.dataio import sparqlquery
9 +from nazca.reference_data.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
10 
11  STOPWORDS = {'fr': FRENCH_STOPWORDS,
12               'en': ENGLISH_STOPWORDS}
13 
14 -# XXX Add SQL source ?
15 -# XXX NER preprocessor
16 -
17 -###############################################################################
18 -### NER SOURCE ################################################################
19 -###############################################################################
20 -class AbstractNerdySource(object):
21 -    """ High-level source for Named Entities Recognition
22 -    """
23 -
24 -    def __init__(self, query, endpoint, name=None, use_cache=True, preprocessors=None):
25 -        """ Initialise the class.
26 -        """
27 -        self.query = query
28 -        self.endpoint = endpoint
29 -        self.name = name
30 -        self.preprocessors = preprocessors or []
31 -        self.use_cache = use_cache
32 -        self._recognized_cache = {}
33 -
34 -    def add_preprocessors(self, preprocessor):
35 -        """ Add a preprocessor
36 -        """
37 -        self.preprocessors.append(preprocessor)
38 -
39 -    def recognize_token(self, token):
40 -        """ Recognize a token
41 -        """
42 -        # Applies source specific preprocessors
43 -        for preprocessor in self.preprocessors:
44 -            token = preprocessor(token)
45 -            if not token:
46 -                return []
47 -        if self.use_cache and token.word in self._recognized_cache:
48 -            return self._recognized_cache[token.word]
49 -        uris = self.query_word(token.word) if token.word else []
50 -        if self.use_cache:
51 -            self._recognized_cache[token.word] = uris
52 -        return uris
53 -
54 -    def query_word(self, word):
55 -        """ Query a word for a Named Entities Recognition process
56 -        """
57 -        raise NotImplementedError
58 -
59 -
60 -class NerdySourceLexical(AbstractNerdySource):
61 -    """ Source based on a (pre-computed) dictionnary of words (token, uri)
62 -    """
63 -    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
64 -        self.lexicon = lexicon
65 -        self.name = name
66 -        self.preprocessors = preprocessors or []
67 -        self.use_cache = use_cache
68 -        self._recognized_cache = {}
69 -
70 -    def query_word(self, word):
71 -        uri = self.lexicon.get(word)
72 -        return [uri,] if uri else []
73 -
74 -
75 -class NerdySourceLocalRql(AbstractNerdySource):
76 -    """ High-level source for Named Entities Recognition
77 -    Local RQL version
78 -    """
79 -
80 -    def __init__(self, query, session, name=None, use_cache=True, preprocessors=None):
81 -        """ Initialise the class.
82 -        """
83 -        self.query = query
84 -        self.session = session
85 -        self.name = name
86 -        self.preprocessors = preprocessors or []
87 -        self.use_cache = use_cache
88 -        self._recognized_cache = {}
89 -
90 -    def query_word(self, word):
91 -        """ Query a word for a Named Entities Recognition process
92 -        """
93 -        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
94 -
95 -
96 -class NerdySourceAppidRql(AbstractNerdySource):
97 -    """ High-level source for Named Entities Recognition
98 -    Appid RQL version
99 -    """
100 -
101 -    def query_word(self, word):
102 -        """ Query a word for a Named Entities Recognition process
103 -        """
104 -        return [r[0] for r in rql_appid_query(self.query, self.endpoint, word=word)]
105 -
106 -
107 -class NerdySourceUrlRql(AbstractNerdySource):
108 -    """ High-level source for Named Entities Recognition
109 -    Url RQL version
110 -    """
111 -
112 -    def query_word(self, word):
113 -        """ Query a word for a Named Entities Recognition process
114 -        """
115 -        return [r[0] for r in rql_url_query(self.query % {'word': word}, self.endpoint)]
116 -
117 -
118 -class NerdySourceSparql(AbstractNerdySource):
119 -    """ High-level source for Named Entities Recognition
120 -    SPARQL version
121 -
122 -   >>> from nerdy.core import NerdySourceSparql
123 -   >>> ner_source = NerdySourceSparql('''SELECT ?uri
124 -                                         WHERE{
125 -                                         ?uri rdfs:label "%(word)s"@en}''',
126 -			                 'http://dbpedia.org/sparql')
127 -   >>> print ner_source.recognize_token('Victor Hugo')
128 -		... ['http://dbpedia.org/resource/Category:Victor_Hugo',
129 -		     'http://dbpedia.org/resource/Victor_Hugo',
130 -		     'http://dbpedia.org/class/yago/VictorHugo',
131 -		     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
132 -		     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
133 -		     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
134 -
135 -    """
136 -
137 -    def query_word(self, word):
138 -        """ Query a word for a Named Entities Recognition process
139 -        """
140 -        return [r['uri']['value'] for r in sparql_query(self.query % {'word': word}, self.endpoint)]
141 -
142 
143  ###############################################################################
144  ### NER PREPROCESSORS #########################################################
145  ###############################################################################
146  class AbstractNerdyPreprocessor(object):
@@ -259,11 +131,11 @@
147          for uri, p, t in named_entities:
148              if uri in seen_uris:
149                  if seen_uris[uri]:
150                      filtered_named_entities.append((uri, p, t))
151              else:
152 -                results = sparql_query(self.query % {'uri': uri}, self.endpoint)
153 +                results = sparqlquery(self.endpoint, self.query % {'uri': uri})
154                  types = set([r['type']['value'] for r in results])
155                  if not len(types.intersection(self.accepted_types)):
156                      seen_uris[uri] = False
157                  else:
158                      seen_uris[uri] = True
diff --git a/named_entities/sources.py b/named_entities/sources.py
@@ -0,0 +1,124 @@
159 +# -*- coding: utf-8 -*-
160 +""" Core functions for Named Entities Recognition.
161 +"""
162 +from nazca.utils.tokenizer import Token
163 +from nazca.utils.dataio import sparqlquery, rqlquery
164 +
165 +
166 +###############################################################################
167 +### NER SOURCE ################################################################
168 +###############################################################################
169 +class AbstractNerSource(object):
170 +    """ High-level source for Named Entities Recognition
171 +    """
172 +
173 +    def __init__(self, endpoint, query, name=None, use_cache=True, preprocessors=None):
174 +        """ Initialise the class.
175 +        """
176 +        self.endpoint = endpoint
177 +        self.query = query
178 +        self.name = name
179 +        self.preprocessors = preprocessors or []
180 +        self.use_cache = use_cache
181 +        self._recognized_cache = {}
182 +
183 +    def add_preprocessors(self, preprocessor):
184 +        """ Add a preprocessor
185 +        """
186 +        self.preprocessors.append(preprocessor)
187 +
188 +    def recognize_token(self, token):
189 +        """ Recognize a token
190 +        """
191 +        # Applies source specific preprocessors
192 +        for preprocessor in self.preprocessors:
193 +            token = preprocessor(token)
194 +            if not token:
195 +                return []
196 +        if self.use_cache and token.word in self._recognized_cache:
197 +            return self._recognized_cache[token.word]
198 +        uris = self.query_word(token.word) if token.word else []
199 +        if self.use_cache:
200 +            self._recognized_cache[token.word] = uris
201 +        return uris
202 +
203 +    def query_word(self, word):
204 +        """ Query a word for a Named Entities Recognition process
205 +        """
206 +        raise NotImplementedError
207 +
208 +
209 +class NerSourceLexicon(AbstractNerSource):
210 +    """ Source based on a (pre-computed) dictionary of words (token, uri)
211 +    """
212 +    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
213 +        self.lexicon = lexicon
214 +        self.name = name
215 +        self.preprocessors = preprocessors or []
216 +        self.use_cache = use_cache
217 +        self._recognized_cache = {}
218 +
219 +    def query_word(self, word):
220 +        uri = self.lexicon.get(word)
221 +        return [uri,] if uri else []
222 +
223 +
224 +class NerSourceLocalRql(AbstractNerSource):
225 +    """ High-level source for Named Entities Recognition
226 +    Local RQL version
227 +    """
228 +
229 +    def __init__(self, session, query, name=None, use_cache=True, preprocessors=None):
230 +        """ Initialise the class.
231 +        """
232 +        self.query = query
233 +        self.session = session
234 +        self.name = name
235 +        self.preprocessors = preprocessors or []
236 +        self.use_cache = use_cache
237 +        self._recognized_cache = {}
238 +
239 +    def query_word(self, word):
240 +        """ Query a word for a Named Entities Recognition process
241 +        """
242 +        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
243 +
244 +
245 +class NerSourceRql(AbstractNerSource):
246 +    """ High-level source for Named Entities Recognition
247 +    Url version (distant source)
248 +    """
249 +
250 +    def query_word(self, word):
251 +        """ Query a word for a Named Entities Recognition process
252 +        """
253 +        if self.endpoint.startswith('http://'):
254 +            # url
255 +            return [r[0] for r in rqlquery(self.endpoint, self.query % {'word': word})]
256 +        else:
257 +            return [r[0] for r in rqlquery(self.endpoint, self.query, word=word)]
258 +
259 +
260 +class NerSourceSparql(AbstractNerSource):
261 +    """ High-level source for Named Entities Recognition
262 +    SPARQL version
263 +
264 +   >>> from nazca.named_entities.sources import NerSourceSparql
265 +   >>> ner_source = NerSourceSparql('http://dbpedia.org/sparql',
266 +                                    '''SELECT ?uri
267 +                                       WHERE{
268 +                                       ?uri rdfs:label "%(word)s"@en}''')
269 +   >>> print ner_source.recognize_token('Victor Hugo')
270 +		... ['http://dbpedia.org/resource/Category:Victor_Hugo',
271 +		     'http://dbpedia.org/resource/Victor_Hugo',
272 +		     'http://dbpedia.org/class/yago/VictorHugo',
273 +		     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
274 +		     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
275 +		     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
276 +
277 +    """
278 +
279 +    def query_word(self, word):
280 +        """ Query a word for a Named Entities Recognition process
281 +        """
282 +        return [r['uri']['value'] for r in sparqlquery(self.endpoint, self.query % {'word': word})]
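
A minimal usage sketch for the new sources module, assuming the nazca package is on the path (and, for the SPARQL case, that the DBpedia endpoint is reachable); note the endpoint now comes before the query:

from nazca.named_entities.sources import NerSourceLexicon, NerSourceSparql

# Lexicon source: a plain {word: uri} mapping, queried word by word
lexicon = NerSourceLexicon({'me': 'http://example.com/me'})
lexicon.query_word('me')        # ['http://example.com/me']
lexicon.query_word('toto')      # []

# SPARQL source: endpoint first, then a query template with a %(word)s slot
sparql = NerSourceSparql('http://dbpedia.org/sparql',
                         '''SELECT ?uri
                            WHERE { ?uri rdfs:label "%(word)s"@en }''')
uris = sparql.query_word('Victor Hugo')   # list of candidate URIs, possibly empty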
diff --git a/named_entities/tokenizer.py b/named_entities/tokenizer.py
@@ -1,66 +0,0 @@
283 -# -*- coding: utf-8 -*-
284 -""" Tokenizer for sentences/words segmentation.
285 -"""
286 -import itertools
287 -import collections
288 -import re
289 -
290 -
291 -Token = collections.namedtuple('Token', ['word', 'start', 'end', 'sentence'])
292 -Sentence = collections.namedtuple('Sentence', ['indice', 'start', 'end'])
293 -
294 -
295 -class RichStringTokenizer(object):
296 -    """Tokenizer for Yams' RichString content.
297 -
298 -    The tokenizer uses a variable-length sliding window, i.e. a sliding
299 -    window yielding tokens of N words.
300 -    """
301 -
302 -    def __init__(self, text, token_min_size=1, token_max_size=3):
303 -        """
304 -        :token_min_size: minimum number of words required to be a valid token
305 -        :token_max_size: minimum number of words required to be a valid token
306 -        """
307 -        self.text = text
308 -        self.token_min_size = token_min_size
309 -        self.token_max_size = token_max_size
310 -
311 -    def iter_tokens(self, text):
312 -        """ Iterate tokens over a text
313 -        """
314 -        # Compute sentences
315 -        sentences = self.find_sentences(text)
316 -        # Compute words
317 -        words = list([m for m in re.finditer(r'[\w@-]+', text, re.UNICODE)])
318 -        indice = 0
319 -        while indice < len(words):
320 -            # Choose the current sentence of the first word
321 -            current_sentence = [s for s in sentences if s.start<=words[indice].start()][-1]
322 -            # Sliding windows over the different words for each sentence
323 -            remaining = len(words) - indice
324 -            for length in range(min(self.token_max_size, remaining), self.token_min_size-1, -1):
325 -                _words = words[indice:indice+length]
326 -                if _words[-1].start() > current_sentence.end:
327 -                    # The last word in not in the same sentence anymore, split
328 -                    continue
329 -                normalized_word = ' '.join([w.group() for w in _words]).strip()
330 -                yield Token(normalized_word, _words[0].start(), _words[-1].end(), current_sentence)
331 -            indice += 1
332 -
333 -    def find_sentences(self, text):
334 -        """ Find the sentences
335 -        """
336 -        return [Sentence(ind, s.start(), s.end()) for ind, s in
337 -                enumerate(re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE))]
338 -
339 -    def load_text(self, text):
340 -        """ Load the text to be tokenized
341 -        """
342 -        self.text = text
343 -
344 -    def __iter__(self):
345 -        """ Iterator over the text given in the object instantiation
346 -        """
347 -        for t in self.iter_tokens(self.text):
348 -            yield t
diff --git a/test/test_named_entities.py b/test/test_named_entities.py
@@ -15,22 +15,25 @@
349  #
350  # You should have received a copy of the GNU Lesser General Public License along
351  # with this program. If not, see <http://www.gnu.org/licenses/>.
352  import unittest2
353 
354 -from nerdy import core
355 -from nerdy.tokenizer import Token, Sentence
356 +from nazca.named_entities.sources import (NerSourceLexicon,
357 +                                          NerSourceSparql,
358 +                                          NerSourceRql)
359 +from nazca.named_entities import named_entities as core
360 +from nazca.utils.tokenizer import Token, Sentence
361 
362 
363  class CoreTest(unittest2.TestCase):
364      """ Test of core """
365 
366 -    def test_lexical_source(self):
367 -        """ Test lexical source """
368 +    def test_lexicon_source(self):
369 +        """ Test lexicon source """
370          lexicon = {'everyone': 'http://example.com/everyone',
371                     'me': 'http://example.com/me'}
372 -        source = core.NerdySourceLexical(lexicon)
373 +        source = NerSourceLexicon(lexicon)
374          self.assertEqual(source.query_word('me'), ['http://example.com/me',])
375          self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
376          self.assertEqual(source.query_word('me everyone'), [])
377          self.assertEqual(source.query_word('toto'), [])
378          # Token
@@ -39,30 +42,30 @@
379          token = Token('ma', 0, 2, None)
380          self.assertEqual(source.recognize_token(token), [])
381 
382      def test_rql_source(self):
383          """ Test rql source """
384 -        source = core.NerdySourceUrlRql('Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"',
385 -                                       'http://www.cubicweb.org')
386 +        source = NerSourceRql('http://www.cubicweb.org',
387 +                              'Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"')
388          self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
389 
390      def test_sparql_source(self):
391          """ Test sparql source """
392 -        source = core.NerdySourceSparql(u'''SELECT ?uri
393 -                                            WHERE{
394 -                                            ?uri rdfs:label "Python"@en .
395 -                                            ?uri rdf:type ?type}''',
396 -                                        u'http://dbpedia.org/sparql')
397 +        source = NerSourceSparql(u'http://dbpedia.org/sparql',
398 +                                 u'''SELECT ?uri
399 +                                     WHERE{
400 +                                     ?uri rdfs:label "Python"@en .
401 +                                     ?uri rdf:type ?type}''')
402          self.assertEqual(source.query_word('cubicweb'),
403                           [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
404                            u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
405 
406      def test_nerdy_process(self):
407          """ Test nerdy process """
408          text = 'Hello everyone, this is   me speaking. And me.'
409 -        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
410 -                                          'me': 'http://example.com/me'})
411 +        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
412 +                                   'me': 'http://example.com/me'})
413          nerdy = core.NerdyProcess((source,))
414          named_entities = nerdy.process_text(text)
415          self.assertEqual(named_entities,
416                           [('http://example.com/everyone', None,
417                             Token(word='everyone', start=6, end=14,
@@ -75,13 +78,13 @@
418                                             sentence=Sentence(indice=1, start=38, end=46)))])
419 
420      def test_nerdy_process_multisources(self):
421          """ Test nerdy process """
422          text = 'Hello everyone, this is   me speaking. And me.'
423 -        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
424 -                                          'me': 'http://example.com/me'})
425 -        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
426 +        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
427 +                                    'me': 'http://example.com/me'})
428 +        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
429          # Two sources, not unique
430          nerdy = core.NerdyProcess((source1, source2))
431          named_entities = nerdy.process_text(text)
432          self.assertEqual(named_entities,
433                           [('http://example.com/everyone', None,
@@ -127,13 +130,13 @@
434                                             sentence=Sentence(indice=1, start=38, end=46)))])
435 
436      def test_nerdy_process_add_sources(self):
437          """ Test nerdy process """
438          text = 'Hello everyone, this is   me speaking. And me.'
439 -        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
440 -                                          'me': 'http://example.com/me'})
441 -        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
442 +        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
443 +                                    'me': 'http://example.com/me'})
444 +        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
445          nerdy = core.NerdyProcess((source1,))
446          named_entities = nerdy.process_text(text)
447          self.assertEqual(named_entities,
448                           [('http://example.com/everyone', None,
449                             Token(word='everyone', start=6, end=14,
@@ -165,12 +168,12 @@
450                                             sentence=Sentence(indice=1, start=38, end=46)))])
451 
452      def test_nerdy_process_preprocess(self):
453          """ Test nerdy process """
454          text = 'Hello Toto, this is   me speaking. And me.'
455 -        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
456 -                                          'me': 'http://example.com/me'})
457 +        source = NerSourceLexicon({'Toto': 'http://example.com/toto',
458 +                                   'me': 'http://example.com/me'})
459          preprocessor = core.NerdyStopwordsFilterPreprocessor()
460          nerdy = core.NerdyProcess((source,),
461                                    preprocessors=(preprocessor,))
462          named_entities = nerdy.process_text(text)
463          self.assertEqual(named_entities, [('http://example.com/toto', None,
@@ -178,12 +181,12 @@
464                                                   sentence=Sentence(indice=0, start=0, end=34)))])
465 
466      def test_nerdy_process_add_preprocess(self):
467          """ Test nerdy process """
468          text = 'Hello Toto, this is   me speaking. And me.'
469 -        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
470 -                                          'me': 'http://example.com/me'})
471 +        source = NerSourceLexicon({'Toto': 'http://example.com/toto',
472 +                                   'me': 'http://example.com/me'})
473          preprocessor = core.NerdyStopwordsFilterPreprocessor()
474          nerdy = core.NerdyProcess((source,),)
475          named_entities = nerdy.process_text(text)
476          self.assertEqual(named_entities,
477                           [('http://example.com/toto', None,
@@ -202,13 +205,13 @@
478                                                   sentence=Sentence(indice=0, start=0, end=34)))])
479 
480      def test_nerdy_process_chained_word(self):
481          """ Test nerdy process """
482          text = 'Hello everyone me, this is   me speaking. And me.'
483 -        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
484 -                                          'everyone me': 'http://example.com/everyone_me',
485 -                                          'me': 'http://example.com/me'})
486 +        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
487 +                                   'everyone me': 'http://example.com/everyone_me',
488 +                                   'me': 'http://example.com/me'})
489          nerdy = core.NerdyProcess((source,))
490          named_entities = nerdy.process_text(text)
491          self.assertEqual(named_entities,
492                           [('http://example.com/everyone_me', None,
493                             Token(word='everyone me', start=6, end=17,
diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py
@@ -15,11 +15,11 @@
494  #
495  # You should have received a copy of the GNU Lesser General Public License along
496  # with this program. If not, see <http://www.gnu.org/licenses/>.
497  import unittest2
498 
499 -from nerdy.tokenizer import RichStringTokenizer, Token, Sentence
500 +from nazca.utils.tokenizer import RichStringTokenizer, Token, Sentence
501 
502 
503  class TokenizerTest(unittest2.TestCase):
504      """ Test of tokenizer """
505 
diff --git a/utils/dataio.py b/utils/dataio.py
@@ -101,11 +101,10 @@
506      indexes form. If indexes is empty, keep raw output"""
507 
508      if not SPARQL_ENABLED:
509          raise ImportError("You have to install SPARQLWrapper and JSON modules to"
510                            "used this function")
511 -
512      sparql = SPARQLWrapper(endpoint)
513      sparql.setQuery(query)
514      sparql.setReturnFormat(JSON)
515      rawresults = sparql.query().convert()
516      labels = rawresults['head']['vars']
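
For reference, the call sites updated in this patch assume sparqlquery takes the endpoint first and returns rows of variable bindings that can be read as r['uri']['value']; a minimal sketch (requires SPARQLWrapper, as the ImportError above notes):

from nazca.utils.dataio import sparqlquery

results = sparqlquery('http://dbpedia.org/sparql',
                      'SELECT ?uri WHERE { ?uri rdfs:label "Python"@en }')
# each row is a dict of variable bindings, e.g. r['uri']['value']
uris = [r['uri']['value'] for r in results]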
diff --git a/utils/tokenizer.py b/utils/tokenizer.py
@@ -0,0 +1,66 @@
517 +# -*- coding: utf-8 -*-
518 +""" Tokenizer for sentences/words segmentation.
519 +"""
520 +import itertools
521 +import collections
522 +import re
523 +
524 +
525 +Token = collections.namedtuple('Token', ['word', 'start', 'end', 'sentence'])
526 +Sentence = collections.namedtuple('Sentence', ['indice', 'start', 'end'])
527 +
528 +
529 +class RichStringTokenizer(object):
530 +    """Tokenizer for Yams' RichString content.
531 +
532 +    The tokenizer uses a variable-length sliding window, i.e. a sliding
533 +    window yielding tokens of N words.
534 +    """
535 +
536 +    def __init__(self, text, token_min_size=1, token_max_size=3):
537 +        """
538 +        :token_min_size: minimum number of words required to be a valid token
539 +        :token_max_size: maximum number of words allowed in a valid token
540 +        """
541 +        self.text = text
542 +        self.token_min_size = token_min_size
543 +        self.token_max_size = token_max_size
544 +
545 +    def iter_tokens(self, text):
546 +        """ Iterate tokens over a text
547 +        """
548 +        # Compute sentences
549 +        sentences = self.find_sentences(text)
550 +        # Compute words
551 +        words = list([m for m in re.finditer(r'[\w@-]+', text, re.UNICODE)])
552 +        indice = 0
553 +        while indice < len(words):
554 +            # Choose the current sentence of the first word
555 +            current_sentence = [s for s in sentences if s.start<=words[indice].start()][-1]
556 +            # Sliding windows over the different words for each sentence
557 +            remaining = len(words) - indice
558 +            for length in range(min(self.token_max_size, remaining), self.token_min_size-1, -1):
559 +                _words = words[indice:indice+length]
560 +                if _words[-1].start() > current_sentence.end:
561 +                    # The last word is not in the same sentence anymore, split
562 +                    continue
563 +                normalized_word = ' '.join([w.group() for w in _words]).strip()
564 +                yield Token(normalized_word, _words[0].start(), _words[-1].end(), current_sentence)
565 +            indice += 1
566 +
567 +    def find_sentences(self, text):
568 +        """ Find the sentences
569 +        """
570 +        return [Sentence(ind, s.start(), s.end()) for ind, s in
571 +                enumerate(re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE))]
572 +
573 +    def load_text(self, text):
574 +        """ Load the text to be tokenized
575 +        """
576 +        self.text = text
577 +
578 +    def __iter__(self):
579 +        """ Iterator over the text given in the object instantiation
580 +        """
581 +        for t in self.iter_tokens(self.text):
582 +            yield t
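
Finally, a minimal sketch of the relocated tokenizer in use (nothing beyond this module is assumed; the Token fields match the namedtuple defined above):

from nazca.utils.tokenizer import RichStringTokenizer

tokenizer = RichStringTokenizer('Hello everyone, this is me speaking. And me.',
                                token_min_size=1, token_max_size=3)
for token in tokenizer:
    # token.word is the joined word window, token.start/token.end are character
    # offsets, and token.sentence is the Sentence the window belongs to
    print token.word, token.start, token.end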