[utils] Extract words splitting regular expression for easier overriding

author          Denis Laxalde <denis.laxalde@logilab.fr>
changeset       9d9d8c4f2bab
branch          default
phase           public
hidden          no
parent revision #a507ff7a2ced Added tag nazca-version-0.6.1, nazca-debian-version-0.6.1-1, nazca-centos-version-0.6.1-1 for changeset 4f38a39d47e0
child revision  #30af4456d4b0 [utils] Use sentences delimiter from NLTK, #315657283bb0 [utils] Use sentences delimiter from NLTK, #004224904efa [pandas] Add support of Pandas, closes #248556, #9a20f1b7e6a8 [utils] Use sentences delimiter from NLTK
files modified by this revision
utils/tokenizer.py
# HG changeset patch
# User Denis Laxalde <denis.laxalde@logilab.fr>
# Date 1406888726 -7200
# Fri Aug 01 12:25:26 2014 +0200
# Node ID 9d9d8c4f2babb9df1327df03f22d0ff9365bad3d
# Parent a507ff7a2ced8cd4084c9b1cb18c9288ece80daf
[utils] Extract words splitting regular expression for easier overriding

diff --git a/utils/tokenizer.py b/utils/tokenizer.py
@@ -24,17 +24,19 @@
1          """
2          self.text = text
3          self.token_min_size = token_min_size
4          self.token_max_size = token_max_size
5 
6 +    words_re = r'[\w@-]+'
7 +
8      def iter_tokens(self, text):
9          """ Iterate tokens over a text
10          """
11          # Compute sentences
12          sentences = self.find_sentences(text)
13          # Compute words
14 -        words = list([m for m in re.finditer(r'[\w@-]+', text, re.UNICODE)])
15 +        words = [m for m in re.finditer(self.words_re, text, re.UNICODE)]
16          indice = 0
17          while indice < len(words):
18              # Choose the current sentence of the first word
19              current_sentence = [s for s in sentences if s.start<=words[indice].start()][-1]
20              # Sliding windows over the different words for each sentence