[utils] Use sentence delimiter from NLTK

The nltk.tokenize.punkt module has a more powerful sentence tokenizer; in particular, it handles single capital letters followed by a dot.
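
For instance (an illustrative snippet, assuming NLTK >= 3.0 is installed; the name "J. Smith" and the punkt output are indicative, not taken from the test suite):

    import re
    from nltk.tokenize.punkt import PunktSentenceTokenizer

    text = 'Hello J. Smith, this is me speaking. And me.'

    # Old regex delimiter: breaks after every dot, including the initial "J."
    print([m.group() for m in re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE)])
    # ['Hello J.', ' Smith, this is me speaking.', ' And me.']

    # Punkt tokenizer: the initial "J." should not end a sentence
    print(PunktSentenceTokenizer().tokenize(text))
    # expected: ['Hello J. Smith, this is me speaking.', 'And me.']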

NLTK is added as a dependency because the new sentence delimiter does not yield the same results as the old one, even in simple cases: in particular, leading spaces between sentences are no longer part of the tokenized sentences. All tests have been adjusted accordingly.
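
For example, with the text used throughout the tests (a sketch, assuming NLTK is installed):

    from nltk.tokenize.punkt import PunktSentenceTokenizer

    text = 'Hello everyone, this is   me speaking. And me.'
    print(list(PunktSentenceTokenizer().span_tokenize(text)))
    # [(0, 38), (39, 46)] -- the second sentence now starts at 39, not 38
    print(repr(text[39:46]))
    # 'And me.' -- the leading space is no longer included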

Note that there's currently no Debian package for NLTK, though there seems to be some hope: see https://bugs.debian.org/279422.
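
A minimal usage sketch of the reworked API, mirroring the new test_find_sentences test: find_sentences is now a staticmethod and raises RuntimeError when NLTK is missing.

    from nazca.utils.tokenizer import (RichStringTokenizer, Sentence,
                                       NLTK_AVAILABLE)

    if NLTK_AVAILABLE:
        sentences = RichStringTokenizer.find_sentences(
            'Hello everyone, this is   me speaking. And me.')
        assert sentences[0] == Sentence(indice=0, start=0, end=38)
        assert sentences[1] == Sentence(indice=1, start=39, end=46)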

Closes #198624.

author: Denis Laxalde <denis.laxalde@logilab.fr>
changeset: 315657283bb0
branch: default
phase: draft
hidden: yes
parent revision: #9d9d8c4f2bab [utils] Extract words splitting regular expression for easier overridding
child revision: #9442efe09ec0 [pkg] Add missing dependencies on scipy and scikit-learn in __pkginfo__ and debian/control
files modified by this revision
debian/control
python-nazca.spec
test/test_filters.py
test/test_ner.py
test/test_tokenizer.py
utils/tokenizer.py
# HG changeset patch
# User Denis Laxalde <denis.laxalde@logilab.fr>
# Date 1406888800 -7200
# Fri Aug 01 12:26:40 2014 +0200
# Node ID 315657283bb0f3b5d7e61291b03c667bfc155225
# Parent 9d9d8c4f2babb9df1327df03f22d0ff9365bad3d
[utils] Use sentence delimiter from NLTK

The `nltk.tokenize.punkt` module has a more powerful sentence tokenizer; in
particular, it handles single capital letters followed by a dot.

NLTK is added as a dependency because the new sentence delimiter does not
yield the same results as the old one, even in simple cases: in particular,
leading spaces between sentences are no longer part of the tokenized
sentences. All tests have been adjusted accordingly.

Note that there's currently no Debian package for NLTK, though there seems to
be some hope: see https://bugs.debian.org/279422.

Closes #198624.

diff --git a/debian/control b/debian/control
@@ -7,8 +7,10 @@
 XS-Python-Version: >= 2.5
 
 Package: nazca
 Architecture: all
 Depends: ${python:Depends}
+Recommends:
+  python-nltk (>= 3.0)
 Description: Python library for data alignment.
  Nazca is a python library that provides a set of alignment helpers
  .
diff --git a/python-nazca.spec b/python-nazca.spec
@@ -48,6 +48,6 @@
 rm -rf $RPM_BUILD_ROOT
 
 
 %files
 %defattr(-,root,root,-)
-/*
\ No newline at end of file
+/*
diff --git a/test/test_filters.py b/test/test_filters.py
@@ -49,14 +49,14 @@
                          ('http://example2.com/me', None,
                           Token(word='me', start=26, end=28,
                                           sentence=Sentence(indice=0, start=0, end=38))),
                          ('http://example.com/me', None,
                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
+                                           sentence=Sentence(indice=1, start=39, end=46))),
                          ('http://example2.com/me', None,
                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
+                                           sentence=Sentence(indice=1, start=39, end=46)))])
 
     def test_occurence_filter_max_occ(self):
         """ Test occurence filter """
         text = 'Hello everyone, this is   me speaking. And me.'
         source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
@@ -82,11 +82,11 @@
                         [('http://example.com/toto_tutu', None,
                           Token(word='toto tutu', start=6, end=15,
                                 sentence=Sentence(indice=0, start=0, end=16))),
                          ('http://example.com/toto_tutu', None,
                           Token(word='toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
+                                 sentence=Sentence(indice=1, start=17, end=26)))])
 
     def test_disambiguation_word_case(self):
         """ Test occurence filter """
         text = 'Hello Toto Tutu. And Toto.'
         source = NerSourceLexicon({'Toto Tutu': 'http://example.com/toto_tutu',
@@ -98,11 +98,11 @@
                         [('http://example.com/toto_tutu', None,
                           Token(word='Toto Tutu', start=6, end=15,
                                 sentence=Sentence(indice=0, start=0, end=16))),
                          ('http://example.com/toto_tutu', None,
                           Token(word='Toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
+                                 sentence=Sentence(indice=1, start=17, end=26)))])
 
     def test_rules_filter(self):
         """ Test rules filter """
         text = 'Hello toto tutu. And toto.'
         source = NerSourceLexicon({'toto tutu': 'http://example.com/toto_tutu',
@@ -115,10 +115,10 @@
                         [('http://example.com/toto_tutu', None,
                           Token(word='toto tutu', start=6, end=15,
                                 sentence=Sentence(indice=0, start=0, end=16))),
                          ('http://example.com/tata', None,
                           Token(word='toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
+                                 sentence=Sentence(indice=1, start=17, end=26)))])
 
 if __name__ == '__main__':
     unittest.main()
 
diff --git a/test/test_ner.py b/test/test_ner.py
@@ -24,11 +24,11 @@
 
 from nazca.ner.sources import (NerSourceLexicon,
                                           NerSourceSparql,
                                           NerSourceRql)
 from nazca.ner import NerProcess
-from nazca.utils.tokenizer import Token, Sentence
+from nazca.utils.tokenizer import Token, Sentence, NLTK_AVAILABLE
 from nazca.ner.preprocessors import NerStopwordsFilterPreprocessor
 
 
 class NerTest(unittest.TestCase):
     """ Test of Ner """
@@ -66,10 +66,12 @@
                          u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
                          u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
 
     def test_ner_process(self):
         """ Test ner process """
+        if not NLTK_AVAILABLE:
+            self.skipTest("nltk is not available, can't parse sentences")
         text = 'Hello everyone, this is   me speaking. And me.'
         source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                    'me': 'http://example.com/me'})
         ner = NerProcess((source,))
         named_entities = ner.process_text(text)
@@ -80,14 +82,16 @@
                          ('http://example.com/me', None,
                           Token(word='me', start=26, end=28,
                                           sentence=Sentence(indice=0, start=0, end=38))),
                          ('http://example.com/me', None,
                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
+                                           sentence=Sentence(indice=1, start=39, end=46)))])
 
     def test_ner_process_multisources(self):
         """ Test ner process """
+        if not NLTK_AVAILABLE:
+            self.skipTest("nltk is not available, can't parse sentences")
         text = 'Hello everyone, this is   me speaking. And me.'
         source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                     'me': 'http://example.com/me'})
         source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
         # Two sources, not unique
@@ -103,14 +107,14 @@
                          ('http://example2.com/me', None,
                           Token(word='me', start=26, end=28,
                                           sentence=Sentence(indice=0, start=0, end=38))),
                          ('http://example.com/me', None,
                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
+                                           sentence=Sentence(indice=1, start=39, end=46))),
                          ('http://example2.com/me', None,
                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
+                                           sentence=Sentence(indice=1, start=39, end=46)))])
         # Two sources, unique
         ner = NerProcess((source1, source2), unique=True)
         named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/everyone', None,
@@ -119,11 +123,11 @@
                          ('http://example.com/me', None,
                           Token(word='me', start=26, end=28,
                                           sentence=Sentence(indice=0, start=0, end=38))),
                          ('http://example.com/me', None,
                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
+                                           sentence=Sentence(indice=1, start=39, end=46)))])
         # Two sources inversed, unique
         ner = NerProcess((source2, source1), unique=True)
         named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/everyone', None,
@@ -132,14 +136,16 @@
                          ('http://example2.com/me', None,
                           Token(word='me', start=26, end=28,
                                           sentence=Sentence(indice=0, start=0, end=38))),
                          ('http://example2.com/me', None,
                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
+                                           sentence=Sentence(indice=1, start=39, end=46)))])
 
     def test_ner_process_add_sources(self):
         """ Test ner process """
+        if not NLTK_AVAILABLE:
+            self.skipTest("nltk is not available, can't parse sentences")
         text = 'Hello everyone, this is   me speaking. And me.'
         source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                     'me': 'http://example.com/me'})
         source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
         ner = NerProcess((source1,))
@@ -151,11 +157,11 @@
                          ('http://example.com/me', None,
                           Token(word='me', start=26, end=28,
                                           sentence=Sentence(indice=0, start=0, end=38))),
                          ('http://example.com/me', None,
                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),])
+                                           sentence=Sentence(indice=1, start=39, end=46))),])
         # Two sources, not unique
         ner.add_ner_source(source2)
         named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/everyone', None,
@@ -167,17 +173,19 @@
                          ('http://example2.com/me', None,
                           Token(word='me', start=26, end=28,
                                           sentence=Sentence(indice=0, start=0, end=38))),
                          ('http://example.com/me', None,
                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
+                                           sentence=Sentence(indice=1, start=39, end=46))),
                          ('http://example2.com/me', None,
                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
+                                           sentence=Sentence(indice=1, start=39, end=46)))])
 
     def test_ner_process_preprocess(self):
         """ Test ner process """
+        if not NLTK_AVAILABLE:
+            self.skipTest("nltk is not available, can't parse sentences")
         text = 'Hello Toto, this is   me speaking. And me.'
         source = NerSourceLexicon({'Toto': 'http://example.com/toto',
                                    'me': 'http://example.com/me'})
         preprocessor = NerStopwordsFilterPreprocessor()
         ner = NerProcess((source,),
@@ -187,10 +195,12 @@
                                            Token(word='Toto', start=6, end=10,
                                                  sentence=Sentence(indice=0, start=0, end=34)))])
 
     def test_ner_process_add_preprocess(self):
         """ Test ner process """
+        if not NLTK_AVAILABLE:
+            self.skipTest("nltk is not available, can't parse sentences")
         text = 'Hello Toto, this is   me speaking. And me.'
         source = NerSourceLexicon({'Toto': 'http://example.com/toto',
                                    'me': 'http://example.com/me'})
         preprocessor = NerStopwordsFilterPreprocessor()
         ner = NerProcess((source,),)
@@ -202,19 +212,21 @@
                          ('http://example.com/me', None,
                           Token(word='me', start=22, end=24,
                                 sentence=Sentence(indice=0, start=0, end=34))),
                          ('http://example.com/me', None,
                           Token(word='me', start=39, end=41,
-                                 sentence=Sentence(indice=1, start=34, end=42)))])
+                                 sentence=Sentence(indice=1, start=35, end=42)))])
         ner.add_preprocessors(preprocessor)
         named_entities = ner.process_text(text)
         self.assertEqual(named_entities, [('http://example.com/toto', None,
                                            Token(word='Toto', start=6, end=10,
                                                  sentence=Sentence(indice=0, start=0, end=34)))])
 
     def test_ner_process_chained_word(self):
         """ Test ner process """
+        if not NLTK_AVAILABLE:
+            self.skipTest("nltk is not available, can't parse sentences")
         text = 'Hello everyone me, this is   me speaking. And me.'
         source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                    'everyone me': 'http://example.com/everyone_me',
                                    'me': 'http://example.com/me'})
         ner = NerProcess((source,))
@@ -225,11 +237,12 @@
                                  sentence=Sentence(indice=0, start=0, end=41))),
                          ('http://example.com/me', None,
                           Token(word='me', start=29, end=31,
                                 sentence=Sentence(indice=0, start=0, end=41))),
                          ('http://example.com/me', None,
-                           Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
+                           Token(word='me', start=46, end=48,
+                                 sentence=Sentence(indice=1, start=42, end=49)))])
 
 
 if __name__ == '__main__':
     unittest.main()
 
diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py
@@ -26,20 +26,26 @@
 
 
 class TokenizerTest(unittest.TestCase):
     """ Test of tokenizer """
 
+    def test_find_sentences(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        sentences = RichStringTokenizer.find_sentences(text)
+        self.assertEqual(sentences[0], Sentence(indice=0, start=0, end=38))
+        self.assertEqual(sentences[1], Sentence(indice=1, start=39, end=46))
+
     def test_richstringtokenizer(self):
         text = 'Hello everyone, this is   me speaking. And me.'
         tokenizer = RichStringTokenizer(text,
                                         token_min_size=1,
                                         token_max_size=3)
         tokens = list(tokenizer)
         self.assertEqual(len(tokens), 18)
         t1 = Token(word='Hello everyone this', start=0, end=20, sentence=Sentence(indice=0, start=0, end=38))
         self.assertEqual(tokens[0], t1)
-        t2 = Token(word='And', start=39, end=42, sentence=Sentence(indice=1, start=38, end=46))
+        t2 = Token(word='And', start=39, end=42, sentence=Sentence(indice=1, start=39, end=46))
         self.assertEqual(tokens[16], t2)
 
     def test_richstringtokenizer_loadtext(self):
         text = 'Hello everyone, this is   me speaking. And me.'
         tokenizer = RichStringTokenizer(text,
@@ -66,28 +72,28 @@
         tokenizer = RichStringTokenizer(text,
                                         token_min_size=1,
                                         token_max_size=4)
         tokens = list(tokenizer)
         self.assertEqual(len(tokens), 21)
-        t1 = Token(word='And me', start=39, end=45, sentence=Sentence(indice=1, start=38, end=46))
+        t1 = Token(word='And me', start=39, end=45, sentence=Sentence(indice=1, start=39, end=46))
         self.assertEqual(tokens[18], t1)
 
     def test_richstringtokenizer_sentences(self):
-        text = 'Hello everyone, this is   me speaking. And me !Why not me ? Blup'
+        text = 'Hello everyone, this is   me speaking. And me ! Why not me ? Blup'
         tokenizer = RichStringTokenizer(text,
                                         token_min_size=1,
                                         token_max_size=4)
         sentences = tokenizer.find_sentences(text)
         self.assertEqual(len(sentences), 4)
         self.assertEqual(text[sentences[0].start:sentences[0].end],
                          'Hello everyone, this is   me speaking.')
         self.assertEqual(text[sentences[1].start:sentences[1].end],
-                         ' And me !')
+                         'And me !')
         self.assertEqual(text[sentences[2].start:sentences[2].end],
                          'Why not me ?')
         self.assertEqual(text[sentences[3].start:sentences[3].end],
-                         ' Blup')
+                         'Blup')
 
 
 if __name__ == '__main__':
     unittest.main()
 
diff --git a/utils/tokenizer.py b/utils/tokenizer.py
@@ -3,10 +3,16 @@
 """
 import itertools
 import collections
 import re
 
+try:
+    from nltk.tokenize.punkt import PunktSentenceTokenizer
+except ImportError:
+    NLTK_AVAILABLE = False
+else:
+    NLTK_AVAILABLE = True
 
 Token = collections.namedtuple('Token', ['word', 'start', 'end', 'sentence'])
 Sentence = collections.namedtuple('Sentence', ['indice', 'start', 'end'])
 
 
@@ -48,15 +54,19 @@
                     continue
                 normalized_word = ' '.join([w.group() for w in _words]).strip()
                 yield Token(normalized_word, _words[0].start(), _words[-1].end(), current_sentence)
             indice += 1
 
-    def find_sentences(self, text):
+    @staticmethod
+    def find_sentences(text):
         """ Find the sentences
         """
-        return [Sentence(ind, s.start(), s.end()) for ind, s in
-                enumerate(re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE))]
+        if not NLTK_AVAILABLE:
+            raise RuntimeError("find_sentences requires NLTK to be installed")
+        sentences = PunktSentenceTokenizer().span_tokenize(text)
+        return [Sentence(ind, start, end)
+                for ind, (start, end) in enumerate(sentences)]
 
     def load_text(self, text):
         """ Load the text to be tokenized
         """
         self.text = text