# HG changeset patch
# User Denis Laxalde <denis.laxalde@logilab.fr>
# Date 1406888800 -7200
# Fri Aug 01 12:26:40 2014 +0200
# Node ID 30af4456d4b0b8832deea58a7bef01bafb6e3158
# Parent 9d9d8c4f2babb9df1327df03f22d0ff9365bad3d
[utils] Use sentence delimiter from NLTK

The `nltk.tokenize.punkt` module has a more powerful sentence tokenizer; in
particular, it handles single capital letters followed by a dot.

NLTK is added as a dependency because the new sentence delimiter does not
yield the same results as the old one, even in simple cases. In particular,
leading spaces between sentences are no longer part of the tokenized
sentences. All tests have been adjusted accordingly.
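
As a rough illustration (a minimal doctest-style sketch, assuming python-nltk
>= 3.0 is installed), the new delimiter yields the spans asserted in the
updated tests:

    >>> from nltk.tokenize.punkt import PunktSentenceTokenizer
    >>> text = 'Hello everyone, this is   me speaking. And me.'
    >>> # span_tokenize() yields (start, end) character offsets per sentence
    >>> list(PunktSentenceTokenizer().span_tokenize(text))
    [(0, 38), (39, 46)]

The second span starts at offset 39, after the separating space, whereas the
old regexp-based delimiter produced (38, 46).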

Note that there's currently no Debian package for NLTK, though there seems to
be some hope: see https://bugs.debian.org/279422.

Closes #198624.

diff --git a/debian/control b/debian/control
@@ -7,8 +7,10 @@
 XS-Python-Version: >= 2.5
 
 Package: nazca
 Architecture: all
 Depends: ${python:Depends}
+Recommends:
+  python-nltk (>= 3.0)
 Description: Python library for data alignment.
  Nazca is a python library that provides a set of alignment helpers
  .
diff --git a/python-nazca.spec b/python-nazca.spec
@@ -48,6 +48,6 @@
 rm -rf $RPM_BUILD_ROOT
 
 
 %files
 %defattr(-,root,root,-)
-/*
\ No newline at end of file
+/*
diff --git a/test/test_filters.py b/test/test_filters.py
@@ -49,14 +49,14 @@
                          ('http://example2.com/me', None,
                           Token(word='me', start=26, end=28,
                                           sentence=Sentence(indice=0, start=0, end=38))),
                          ('http://example.com/me', None,
                           Token(word='me', start=43, end=45,
-                                          sentence=Sentence(indice=1, start=38, end=46))),
+                                          sentence=Sentence(indice=1, start=39, end=46))),
                          ('http://example2.com/me', None,
                           Token(word='me', start=43, end=45,
-                                          sentence=Sentence(indice=1, start=38, end=46)))])
+                                          sentence=Sentence(indice=1, start=39, end=46)))])
 
     def test_occurence_filter_max_occ(self):
         """ Test occurence filter """
         text = 'Hello everyone, this is   me speaking. And me.'
         source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
@@ -82,11 +82,11 @@
                          [('http://example.com/toto_tutu', None,
                            Token(word='toto tutu', start=6, end=15,
                                  sentence=Sentence(indice=0, start=0, end=16))),
                           ('http://example.com/toto_tutu', None,
                            Token(word='toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
+                                 sentence=Sentence(indice=1, start=17, end=26)))])
 
     def test_disambiguation_word_case(self):
         """ Test occurence filter """
         text = 'Hello Toto Tutu. And Toto.'
         source = NerSourceLexicon({'Toto Tutu': 'http://example.com/toto_tutu',
@@ -98,11 +98,11 @@
                          [('http://example.com/toto_tutu', None,
                            Token(word='Toto Tutu', start=6, end=15,
                                  sentence=Sentence(indice=0, start=0, end=16))),
                           ('http://example.com/toto_tutu', None,
                            Token(word='Toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
+                                 sentence=Sentence(indice=1, start=17, end=26)))])
 
     def test_rules_filter(self):
         """ Test rules filter """
         text = 'Hello toto tutu. And toto.'
         source = NerSourceLexicon({'toto tutu': 'http://example.com/toto_tutu',
@@ -115,10 +115,10 @@
                          [('http://example.com/toto_tutu', None,
                            Token(word='toto tutu', start=6, end=15,
                                  sentence=Sentence(indice=0, start=0, end=16))),
                           ('http://example.com/tata', None,
                            Token(word='toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
+                                 sentence=Sentence(indice=1, start=17, end=26)))])
 
 if __name__ == '__main__':
     unittest.main()
 
diff --git a/test/test_ner.py b/test/test_ner.py
@@ -24,11 +24,11 @@
 
 from nazca.ner.sources import (NerSourceLexicon,
                                           NerSourceSparql,
                                           NerSourceRql)
 from nazca.ner import NerProcess
-from nazca.utils.tokenizer import Token, Sentence
+from nazca.utils.tokenizer import Token, Sentence, NLTK_AVAILABLE
 from nazca.ner.preprocessors import NerStopwordsFilterPreprocessor
 
 
 class NerTest(unittest.TestCase):
     """ Test of Ner """
@@ -64,10 +64,11 @@
         self.assertEqual(source.query_word('Python'),
                          [u'http://dbpedia.org/resource/Python',
                           u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
                           u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
 
+    @unittest.skipUnless(NLTK_AVAILABLE, 'nltk is not available')
     def test_ner_process(self):
         """ Test ner process """
         text = 'Hello everyone, this is   me speaking. And me.'
         source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                    'me': 'http://example.com/me'})
@@ -80,12 +81,13 @@
                          ('http://example.com/me', None,
                           Token(word='me', start=26, end=28,
                                           sentence=Sentence(indice=0, start=0, end=38))),
                          ('http://example.com/me', None,
                           Token(word='me', start=43, end=45,
-                                          sentence=Sentence(indice=1, start=38, end=46)))])
+                                          sentence=Sentence(indice=1, start=39, end=46)))])
 
+    @unittest.skipUnless(NLTK_AVAILABLE, 'nltk is not available')
     def test_ner_process_multisources(self):
         """ Test ner process """
         text = 'Hello everyone, this is   me speaking. And me.'
         source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                     'me': 'http://example.com/me'})
@@ -103,14 +105,14 @@
                          ('http://example2.com/me', None,
                           Token(word='me', start=26, end=28,
                                           sentence=Sentence(indice=0, start=0, end=38))),
                          ('http://example.com/me', None,
                           Token(word='me', start=43, end=45,
-                                          sentence=Sentence(indice=1, start=38, end=46))),
+                                          sentence=Sentence(indice=1, start=39, end=46))),
                          ('http://example2.com/me', None,
                           Token(word='me', start=43, end=45,
-                                          sentence=Sentence(indice=1, start=38, end=46)))])
+                                          sentence=Sentence(indice=1, start=39, end=46)))])
         # Two sources, unique
         ner = NerProcess((source1, source2), unique=True)
         named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/everyone', None,
@@ -119,11 +121,11 @@
                          ('http://example.com/me', None,
                           Token(word='me', start=26, end=28,
                                           sentence=Sentence(indice=0, start=0, end=38))),
                          ('http://example.com/me', None,
                           Token(word='me', start=43, end=45,
-                                          sentence=Sentence(indice=1, start=38, end=46)))])
+                                          sentence=Sentence(indice=1, start=39, end=46)))])
         # Two sources inversed, unique
         ner = NerProcess((source2, source1), unique=True)
         named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/everyone', None,
@@ -132,12 +134,13 @@
                          ('http://example2.com/me', None,
                           Token(word='me', start=26, end=28,
                                           sentence=Sentence(indice=0, start=0, end=38))),
                          ('http://example2.com/me', None,
                           Token(word='me', start=43, end=45,
-                                          sentence=Sentence(indice=1, start=38, end=46)))])
+                                          sentence=Sentence(indice=1, start=39, end=46)))])
 
+    @unittest.skipUnless(NLTK_AVAILABLE, 'nltk is not available')
     def test_ner_process_add_sources(self):
         """ Test ner process """
         text = 'Hello everyone, this is   me speaking. And me.'
         source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                     'me': 'http://example.com/me'})
@@ -151,11 +154,11 @@
                          ('http://example.com/me', None,
                           Token(word='me', start=26, end=28,
                                           sentence=Sentence(indice=0, start=0, end=38))),
                          ('http://example.com/me', None,
                           Token(word='me', start=43, end=45,
-                                          sentence=Sentence(indice=1, start=38, end=46))),])
+                                          sentence=Sentence(indice=1, start=39, end=46))),])
         # Two sources, not unique
         ner.add_ner_source(source2)
         named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/everyone', None,
@@ -167,15 +170,16 @@
                          ('http://example2.com/me', None,
                           Token(word='me', start=26, end=28,
                                           sentence=Sentence(indice=0, start=0, end=38))),
                          ('http://example.com/me', None,
                           Token(word='me', start=43, end=45,
-                                          sentence=Sentence(indice=1, start=38, end=46))),
+                                          sentence=Sentence(indice=1, start=39, end=46))),
                          ('http://example2.com/me', None,
                           Token(word='me', start=43, end=45,
-                                          sentence=Sentence(indice=1, start=38, end=46)))])
+                                          sentence=Sentence(indice=1, start=39, end=46)))])
 
+    @unittest.skipUnless(NLTK_AVAILABLE, 'nltk is not available')
     def test_ner_process_preprocess(self):
         """ Test ner process """
         text = 'Hello Toto, this is   me speaking. And me.'
         source = NerSourceLexicon({'Toto': 'http://example.com/toto',
                                    'me': 'http://example.com/me'})
@@ -185,10 +189,11 @@
         named_entities = ner.process_text(text)
         self.assertEqual(named_entities, [('http://example.com/toto', None,
                                            Token(word='Toto', start=6, end=10,
                                                  sentence=Sentence(indice=0, start=0, end=34)))])
 
+    @unittest.skipUnless(NLTK_AVAILABLE, 'nltk is not available')
     def test_ner_process_add_preprocess(self):
         """ Test ner process """
         text = 'Hello Toto, this is   me speaking. And me.'
         source = NerSourceLexicon({'Toto': 'http://example.com/toto',
                                    'me': 'http://example.com/me'})
@@ -202,17 +207,18 @@
                          ('http://example.com/me', None,
                           Token(word='me', start=22, end=24,
                                 sentence=Sentence(indice=0, start=0, end=34))),
                          ('http://example.com/me', None,
                           Token(word='me', start=39, end=41,
-                                sentence=Sentence(indice=1, start=34, end=42)))])
+                                sentence=Sentence(indice=1, start=35, end=42)))])
         ner.add_preprocessors(preprocessor)
         named_entities = ner.process_text(text)
         self.assertEqual(named_entities, [('http://example.com/toto', None,
                                            Token(word='Toto', start=6, end=10,
                                                  sentence=Sentence(indice=0, start=0, end=34)))])
 
+    @unittest.skipUnless(NLTK_AVAILABLE, 'nltk is not available')
     def test_ner_process_chained_word(self):
         """ Test ner process """
         text = 'Hello everyone me, this is   me speaking. And me.'
         source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                    'everyone me': 'http://example.com/everyone_me',
@@ -225,11 +231,12 @@
                                 sentence=Sentence(indice=0, start=0, end=41))),
                          ('http://example.com/me', None,
                           Token(word='me', start=29, end=31,
                                 sentence=Sentence(indice=0, start=0, end=41))),
                          ('http://example.com/me', None,
-                          Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
+                          Token(word='me', start=46, end=48,
+                                sentence=Sentence(indice=1, start=42, end=49)))])
 
 
 if __name__ == '__main__':
     unittest.main()
 
diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py
@@ -26,20 +26,26 @@
 
 
 class TokenizerTest(unittest.TestCase):
     """ Test of tokenizer """
 
+    def test_find_sentences(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        sentences = RichStringTokenizer.find_sentences(text)
+        self.assertEqual(sentences[0], Sentence(indice=0, start=0, end=38))
+        self.assertEqual(sentences[1], Sentence(indice=1, start=39, end=46))
+
     def test_richstringtokenizer(self):
         text = 'Hello everyone, this is   me speaking. And me.'
         tokenizer = RichStringTokenizer(text,
                                         token_min_size=1,
                                         token_max_size=3)
         tokens = list(tokenizer)
         self.assertEqual(len(tokens), 18)
         t1 = Token(word='Hello everyone this', start=0, end=20, sentence=Sentence(indice=0, start=0, end=38))
         self.assertEqual(tokens[0], t1)
-        t2 = Token(word='And', start=39, end=42, sentence=Sentence(indice=1, start=38, end=46))
+        t2 = Token(word='And', start=39, end=42, sentence=Sentence(indice=1, start=39, end=46))
         self.assertEqual(tokens[16], t2)
 
     def test_richstringtokenizer_loadtext(self):
         text = 'Hello everyone, this is   me speaking. And me.'
         tokenizer = RichStringTokenizer(text,
@@ -66,28 +72,28 @@
         tokenizer = RichStringTokenizer(text,
                                         token_min_size=1,
                                         token_max_size=4)
         tokens = list(tokenizer)
         self.assertEqual(len(tokens), 21)
-        t1 = Token(word='And me', start=39, end=45, sentence=Sentence(indice=1, start=38, end=46))
+        t1 = Token(word='And me', start=39, end=45, sentence=Sentence(indice=1, start=39, end=46))
         self.assertEqual(tokens[18], t1)
 
     def test_richstringtokenizer_sentences(self):
-        text = 'Hello everyone, this is   me speaking. And me !Why not me ? Blup'
+        text = 'Hello everyone, this is   me speaking. And me ! Why not me ? Blup'
         tokenizer = RichStringTokenizer(text,
                                         token_min_size=1,
                                         token_max_size=4)
         sentences = tokenizer.find_sentences(text)
         self.assertEqual(len(sentences), 4)
         self.assertEqual(text[sentences[0].start:sentences[0].end],
                          'Hello everyone, this is   me speaking.')
         self.assertEqual(text[sentences[1].start:sentences[1].end],
-                         ' And me !')
+                         'And me !')
         self.assertEqual(text[sentences[2].start:sentences[2].end],
                          'Why not me ?')
         self.assertEqual(text[sentences[3].start:sentences[3].end],
-                         ' Blup')
+                         'Blup')
 
 
 if __name__ == '__main__':
     unittest.main()
 
diff --git a/utils/tokenizer.py b/utils/tokenizer.py
@@ -3,10 +3,16 @@
 """
 import itertools
 import collections
 import re
 
+try:
+    from nltk.tokenize.punkt import PunktSentenceTokenizer
+except ImportError:
+    NLTK_AVAILABLE = False
+else:
+    NLTK_AVAILABLE = True
 
 Token = collections.namedtuple('Token', ['word', 'start', 'end', 'sentence'])
 Sentence = collections.namedtuple('Sentence', ['indice', 'start', 'end'])
 
 
@@ -48,15 +54,19 @@
                     continue
                 normalized_word = ' '.join([w.group() for w in _words]).strip()
                 yield Token(normalized_word, _words[0].start(), _words[-1].end(), current_sentence)
             indice += 1
 
-    def find_sentences(self, text):
+    @staticmethod
+    def find_sentences(text):
         """ Find the sentences
         """
-        return [Sentence(ind, s.start(), s.end()) for ind, s in
-                enumerate(re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE))]
+        if not NLTK_AVAILABLE:
+            raise RuntimeError("find_sentences requires NLTK to be installed")
+        sentences = PunktSentenceTokenizer().span_tokenize(text)
+        return [Sentence(ind, start, end)
+                for ind, (start, end) in enumerate(sentences)]
 
     def load_text(self, text):
         """ Load the text to be tokenized
         """
         self.text = text