[named entities] Move tokenizer to utils and create a sources module for named entities, related to #187461

author          vincent.michel@logilab.fr
changeset       77a3a4107f5c
branch          default
phase           public
hidden          no
parent revision #4ef3109eab7a [dataio] Merge dataio and tests, related to #187461
child revision  #102c6331f3f6 [named entities] Split core into preprocessors and filters modules, related to #187461
files modified by this revision
named_entities/named_entities.py
named_entities/sources.py
named_entities/tokenizer.py
test/test_named_entities.py
test/test_tokenizer.py
utils/dataio.py
utils/tokenizer.py
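
In short, this revision only moves and renames code. A summary of the import-path changes, taken from the hunks below (not a new API):

    # old (nerdy)                        new (nazca)
    nerdy.tokenizer                  ->  nazca.utils.tokenizer
    nerdy.dataio.sparql_query        ->  nazca.utils.dataio.sparqlquery
    nerdy.core.NerdySourceLexical    ->  nazca.named_entities.sources.NerSourceLexicon
    nerdy.core.NerdySourceUrlRql     ->  nazca.named_entities.sources.NerSourceRql
    nerdy.core.NerdySourceSparql     ->  nazca.named_entities.sources.NerSourceSparql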
# HG changeset patch
# User vincent.michel@logilab.fr
# Date 1387464326 0
# Thu Dec 19 14:45:26 2013 +0000
# Node ID 77a3a4107f5c930dca3228af0a0ec75ad224d538
# Parent 4ef3109eab7a048975084088e63ae456e571ff8e
[named entities] Move tokenizer to utils and create a sources module for named entities, related to #187461

diff --git a/named_entities/named_entities.py b/named_entities/named_entities.py
@@ -1,143 +1,15 @@
1  # -*- coding: utf-8 -*-
2  """ Core functions for Named Entities Recognition.
3  """
4 -from nerdy.tokenizer import RichStringTokenizer, Token
5 -from nerdy.dataio import sparql_query, rql_url_query, rql_appid_query
6 -from nerdy.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
7 +from nazca.utils.tokenizer import RichStringTokenizer, Token
8 +from nazca.utils.dataio import sparqlquery
9 +from nazca.reference_data.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
10 
11  STOPWORDS = {'fr': FRENCH_STOPWORDS,
12               'en': ENGLISH_STOPWORDS}
13 
14 -# XXX Add SQL source ?
15 -# XXX NER preprocessor
16 -
17 -###############################################################################
18 -### NER SOURCE ################################################################
19 -###############################################################################
20 -class AbstractNerdySource(object):
21 -    """ High-level source for Named Entities Recognition
22 -    """
23 -
24 -    def __init__(self, query, endpoint, name=None, use_cache=True, preprocessors=None):
25 -        """ Initialise the class.
26 -        """
27 -        self.query = query
28 -        self.endpoint = endpoint
29 -        self.name = name
30 -        self.preprocessors = preprocessors or []
31 -        self.use_cache = use_cache
32 -        self._recognized_cache = {}
33 -
34 -    def add_preprocessors(self, preprocessor):
35 -        """ Add a preprocessor
36 -        """
37 -        self.preprocessors.append(preprocessor)
38 -
39 -    def recognize_token(self, token):
40 -        """ Recognize a token
41 -        """
42 -        # Applies source specific preprocessors
43 -        for preprocessor in self.preprocessors:
44 -            token = preprocessor(token)
45 -            if not token:
46 -                return []
47 -        if self.use_cache and token.word in self._recognized_cache:
48 -            return self._recognized_cache[token.word]
49 -        uris = self.query_word(token.word) if token.word else []
50 -        if self.use_cache:
51 -            self._recognized_cache[token.word] = uris
52 -        return uris
53 -
54 -    def query_word(self, word):
55 -        """ Query a word for a Named Entities Recognition process
56 -        """
57 -        raise NotImplementedError
58 -
59 -
60 -class NerdySourceLexical(AbstractNerdySource):
61 -    """ Source based on a (pre-computed) dictionnary of words (token, uri)
62 -    """
63 -    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
64 -        self.lexicon = lexicon
65 -        self.name = name
66 -        self.preprocessors = preprocessors or []
67 -        self.use_cache = use_cache
68 -        self._recognized_cache = {}
69 -
70 -    def query_word(self, word):
71 -        uri = self.lexicon.get(word)
72 -        return [uri,] if uri else []
73 -
74 -
75 -class NerdySourceLocalRql(AbstractNerdySource):
76 -    """ High-level source for Named Entities Recognition
77 -    Local RQL version
78 -    """
79 -
80 -    def __init__(self, query, session, name=None, use_cache=True, preprocessors=None):
81 -        """ Initialise the class.
82 -        """
83 -        self.query = query
84 -        self.session = session
85 -        self.name = name
86 -        self.preprocessors = preprocessors or []
87 -        self.use_cache = use_cache
88 -        self._recognized_cache = {}
89 -
90 -    def query_word(self, word):
91 -        """ Query a word for a Named Entities Recognition process
92 -        """
93 -        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
94 -
95 -
96 -class NerdySourceAppidRql(AbstractNerdySource):
97 -    """ High-level source for Named Entities Recognition
98 -    Appid RQL version
99 -    """
100 -
101 -    def query_word(self, word):
102 -        """ Query a word for a Named Entities Recognition process
103 -        """
104 -        return [r[0] for r in rql_appid_query(self.query, self.endpoint, word=word)]
105 -
106 -
107 -class NerdySourceUrlRql(AbstractNerdySource):
108 -    """ High-level source for Named Entities Recognition
109 -    Url RQL version
110 -    """
111 -
112 -    def query_word(self, word):
113 -        """ Query a word for a Named Entities Recognition process
114 -        """
115 -        return [r[0] for r in rql_url_query(self.query % {'word': word}, self.endpoint)]
116 -
117 -
118 -class NerdySourceSparql(AbstractNerdySource):
119 -    """ High-level source for Named Entities Recognition
120 -    SPARQL version
121 -
122 -   >>> from nerdy.core import NerdySourceSparql
123 -   >>> ner_source = NerdySourceSparql('''SELECT ?uri
124 -                                         WHERE{
125 -                                         ?uri rdfs:label "%(word)s"@en}''',
126 -			                 'http://dbpedia.org/sparql')
127 -   >>> print ner_source.recognize_token('Victor Hugo')
128 -		... ['http://dbpedia.org/resource/Category:Victor_Hugo',
129 -		     'http://dbpedia.org/resource/Victor_Hugo',
130 -		     'http://dbpedia.org/class/yago/VictorHugo',
131 -		     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
132 -		     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
133 -		     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
134 -
135 -    """
136 -
137 -    def query_word(self, word):
138 -        """ Query a word for a Named Entities Recognition process
139 -        """
140 -        return [r['uri']['value'] for r in sparql_query(self.query % {'word': word}, self.endpoint)]
141 -
142 
143  ###############################################################################
144  ### NER PREPROCESSORS #########################################################
145  ###############################################################################
146  class AbstractNerdyPreprocessor(object):
@@ -259,11 +131,11 @@
147          for uri, p, t in named_entities:
148              if uri in seen_uris:
149                  if seen_uris[uri]:
150                      filtered_named_entities.append((uri, p, t))
151              else:
152 -                results = sparql_query(self.query % {'uri': uri}, self.endpoint)
153 +                results = sparqlquery(self.endpoint, self.query % {'uri': uri})
154                  types = set([r['type']['value'] for r in results])
155                  if not len(types.intersection(self.accepted_types)):
156                      seen_uris[uri] = False
157                  else:
158                      seen_uris[uri] = True
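
Note on the hunk above: nazca.utils.dataio.sparqlquery takes the endpoint first and the query second, the reverse of the old nerdy.dataio sparql_query. A minimal call-site sketch, assuming the raw JSON bindings consumed by the filter above (the endpoint and query strings are illustrative):

    from nazca.utils.dataio import sparqlquery

    # endpoint first, then the query (the old sparql_query took them in the other order)
    endpoint = 'http://dbpedia.org/sparql'  # illustrative endpoint
    query = ('SELECT ?type WHERE { '
             '<http://dbpedia.org/resource/Victor_Hugo> rdf:type ?type }')
    results = sparqlquery(endpoint, query)
    types = set([r['type']['value'] for r in results])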
diff --git a/named_entities/sources.py b/named_entities/sources.py
@@ -0,0 +1,124 @@
159 +# -*- coding: utf-8 -*-
160 +""" Sources for Named Entities Recognition.
161 +"""
162 +from nazca.utils.tokenizer import Token
163 +from nazca.utils.dataio import sparqlquery, rqlquery
164 +
165 +
166 +###############################################################################
167 +### NER SOURCE ################################################################
168 +###############################################################################
169 +class AbstractNerSource(object):
170 +    """ High-level source for Named Entities Recognition
171 +    """
172 +
173 +    def __init__(self, endpoint, query, name=None, use_cache=True, preprocessors=None):
174 +        """ Initialise the class.
175 +        """
176 +        self.endpoint = endpoint
177 +        self.query = query
178 +        self.name = name
179 +        self.preprocessors = preprocessors or []
180 +        self.use_cache = use_cache
181 +        self._recognized_cache = {}
182 +
183 +    def add_preprocessors(self, preprocessor):
184 +        """ Add a preprocessor
185 +        """
186 +        self.preprocessors.append(preprocessor)
187 +
188 +    def recognize_token(self, token):
189 +        """ Recognize a token
190 +        """
191 +        # Apply source-specific preprocessors
192 +        for preprocessor in self.preprocessors:
193 +            token = preprocessor(token)
194 +            if not token:
195 +                return []
196 +        if self.use_cache and token.word in self._recognized_cache:
197 +            return self._recognized_cache[token.word]
198 +        uris = self.query_word(token.word) if token.word else []
199 +        if self.use_cache:
200 +            self._recognized_cache[token.word] = uris
201 +        return uris
202 +
203 +    def query_word(self, word):
204 +        """ Query a word for a Named Entities Recognition process
205 +        """
206 +        raise NotImplementedError
207 +
208 +
209 +class NerSourceLexicon(AbstractNerSource):
210 +    """ Source based on a (pre-computed) dictionary of words (token, uri)
211 +    """
212 +    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
213 +        self.lexicon = lexicon
214 +        self.name = name
215 +        self.preprocessors = preprocessors or []
216 +        self.use_cache = use_cache
217 +        self._recognized_cache = {}
218 +
219 +    def query_word(self, word):
220 +        uri = self.lexicon.get(word)
221 +        return [uri,] if uri else []
222 +
223 +
224 +class NerSourceLocalRql(AbstractNerSource):
225 +    """ High-level source for Named Entities Recognition
226 +    Local RQL version
227 +    """
228 +
229 +    def __init__(self, session, query, name=None, use_cache=True, preprocessors=None):
230 +        """ Initialise the class.
231 +        """
232 +        self.query = query
233 +        self.session = session
234 +        self.name = name
235 +        self.preprocessors = preprocessors or []
236 +        self.use_cache = use_cache
237 +        self._recognized_cache = {}
238 +
239 +    def query_word(self, word):
240 +        """ Query a word for a Named Entities Recognition process
241 +        """
242 +        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
243 +
244 +
245 +class NerSourceRql(AbstractNerSource):
246 +    """ High-level source for Named Entities Recognition
247 +    URL version (remote source)
248 +    """
249 +
250 +    def query_word(self, word):
251 +        """ Query a word for a Named Entities Recognition process
252 +        """
253 +        if self.endpoint.startswith('http://'):
254 +            # url
255 +            return [r[0] for r in rqlquery(self.endpoint, self.query % {'word': word})]
256 +        else:
257 +            return [r[0] for r in rqlquery(self.endpoint, self.query, word=word)]
258 +
259 +
260 +class NerSourceSparql(AbstractNerSource):
261 +    """ High-level source for Named Entities Recognition
262 +    SPARQL version
263 +
264 +    >>> from nazca.named_entities.sources import NerSourceSparql
265 +    >>> ner_source = NerSourceSparql('http://dbpedia.org/sparql',
266 +    ...                              '''SELECT ?uri
267 +    ...                                 WHERE{
268 +    ...                                 ?uri rdfs:label "%(word)s"@en}''')
269 +    >>> print ner_source.query_word('Victor Hugo')
270 +    ['http://dbpedia.org/resource/Category:Victor_Hugo',
271 +     'http://dbpedia.org/resource/Victor_Hugo',
272 +     'http://dbpedia.org/class/yago/VictorHugo',
273 +     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
274 +     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
275 +     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
276 +
277 +    """
278 +
279 +    def query_word(self, word):
280 +        """ Query a word for a Named Entities Recognition process
281 +        """
282 +        return [r['uri']['value'] for r in sparqlquery(self.endpoint, self.query % {'word': word})]
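
The classes above keep the behaviour of the removed nerdy ones; only the names and the argument order change (endpoint before query). A minimal usage sketch of the relocated sources, with illustrative lexicon entries and endpoints:

    from nazca.named_entities.sources import NerSourceLexicon, NerSourceRql

    # Lexicon source: an exact word-to-URI mapping, queried token by token
    source = NerSourceLexicon({'me': 'http://example.com/me'})
    assert source.query_word('me') == ['http://example.com/me']

    # RQL source: endpoint first, then the query template
    rql_source = NerSourceRql('http://www.cubicweb.org',
                              'Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"')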
diff --git a/test/test_named_entities.py b/test/test_named_entities.py
@@ -15,22 +15,25 @@
283  #
284  # You should have received a copy of the GNU Lesser General Public License along
285  # with this program. If not, see <http://www.gnu.org/licenses/>.
286  import unittest2
287 
288 -from nerdy import core
289 -from nerdy.tokenizer import Token, Sentence
290 +from nazca.named_entities.sources import (NerSourceLexicon,
291 +                                          NerSourceSparql,
292 +                                          NerSourceRql)
293 +from nazca.named_entities import named_entities as core
294 +from nazca.utils.tokenizer import Token, Sentence
295 
296 
297  class CoreTest(unittest2.TestCase):
298      """ Test of core """
299 
300 -    def test_lexical_source(self):
301 -        """ Test lexical source """
302 +    def test_lexicon_source(self):
303 +        """ Test lexicon source """
304          lexicon = {'everyone': 'http://example.com/everyone',
305                     'me': 'http://example.com/me'}
306 -        source = core.NerdySourceLexical(lexicon)
307 +        source = NerSourceLexicon(lexicon)
308          self.assertEqual(source.query_word('me'), ['http://example.com/me',])
309          self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
310          self.assertEqual(source.query_word('me everyone'), [])
311          self.assertEqual(source.query_word('toto'), [])
312          # Token
@@ -39,30 +42,30 @@
313          token = Token('ma', 0, 2, None)
314          self.assertEqual(source.recognize_token(token), [])
315 
316      def test_rql_source(self):
317          """ Test rql source """
318 -        source = core.NerdySourceUrlRql('Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"',
319 -                                       'http://www.cubicweb.org')
320 +        source = NerSourceRql('http://www.cubicweb.org',
321 +                              'Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"')
322          self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
323 
324      def test_sparql_source(self):
325          """ Test sparql source """
326 -        source = core.NerdySourceSparql(u'''SELECT ?uri
327 -                                            WHERE{
328 -                                            ?uri rdfs:label "Python"@en .
329 -                                            ?uri rdf:type ?type}''',
330 -                                        u'http://dbpedia.org/sparql')
331 +        source = NerSourceSparql(u'http://dbpedia.org/sparql',
332 +                                 u'''SELECT ?uri
333 +                                     WHERE{
334 +                                     ?uri rdfs:label "Python"@en .
335 +                                     ?uri rdf:type ?type}''')
336          self.assertEqual(source.query_word('cubicweb'),
337                           [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
338                            u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
339 
340      def test_nerdy_process(self):
341          """ Test nerdy process """
342          text = 'Hello everyone, this is   me speaking. And me.'
343 -        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
344 -                                          'me': 'http://example.com/me'})
345 +        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
346 +                                   'me': 'http://example.com/me'})
347          nerdy = core.NerdyProcess((source,))
348          named_entities = nerdy.process_text(text)
349          self.assertEqual(named_entities,
350                           [('http://example.com/everyone', None,
351                             Token(word='everyone', start=6, end=14,
@@ -75,13 +78,13 @@
352                                             sentence=Sentence(indice=1, start=38, end=46)))])
353 
354      def test_nerdy_process_multisources(self):
355          """ Test nerdy process """
356          text = 'Hello everyone, this is   me speaking. And me.'
357 -        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
358 -                                          'me': 'http://example.com/me'})
359 -        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
360 +        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
361 +                                    'me': 'http://example.com/me'})
362 +        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
363          # Two sources, not unique
364          nerdy = core.NerdyProcess((source1, source2))
365          named_entities = nerdy.process_text(text)
366          self.assertEqual(named_entities,
367                           [('http://example.com/everyone', None,
@@ -127,13 +130,13 @@
368                                             sentence=Sentence(indice=1, start=38, end=46)))])
369 
370      def test_nerdy_process_add_sources(self):
371          """ Test nerdy process """
372          text = 'Hello everyone, this is   me speaking. And me.'
373 -        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
374 -                                          'me': 'http://example.com/me'})
375 -        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
376 +        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
377 +                                    'me': 'http://example.com/me'})
378 +        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
379          nerdy = core.NerdyProcess((source1,))
380          named_entities = nerdy.process_text(text)
381          self.assertEqual(named_entities,
382                           [('http://example.com/everyone', None,
383                             Token(word='everyone', start=6, end=14,
@@ -165,12 +168,12 @@
384                                             sentence=Sentence(indice=1, start=38, end=46)))])
385 
386      def test_nerdy_process_preprocess(self):
387          """ Test nerdy process """
388          text = 'Hello Toto, this is   me speaking. And me.'
389 -        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
390 -                                          'me': 'http://example.com/me'})
391 +        source = NerSourceLexicon({'Toto': 'http://example.com/toto',
392 +                                   'me': 'http://example.com/me'})
393          preprocessor = core.NerdyStopwordsFilterPreprocessor()
394          nerdy = core.NerdyProcess((source,),
395                                    preprocessors=(preprocessor,))
396          named_entities = nerdy.process_text(text)
397          self.assertEqual(named_entities, [('http://example.com/toto', None,
@@ -178,12 +181,12 @@
398                                                   sentence=Sentence(indice=0, start=0, end=34)))])
399 
400      def test_nerdy_process_add_preprocess(self):
401          """ Test nerdy process """
402          text = 'Hello Toto, this is   me speaking. And me.'
403 -        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
404 -                                          'me': 'http://example.com/me'})
405 +        source = NerSourceLexicon({'Toto': 'http://example.com/toto',
406 +                                   'me': 'http://example.com/me'})
407          preprocessor = core.NerdyStopwordsFilterPreprocessor()
408          nerdy = core.NerdyProcess((source,),)
409          named_entities = nerdy.process_text(text)
410          self.assertEqual(named_entities,
411                           [('http://example.com/toto', None,
@@ -202,13 +205,13 @@
412                                                   sentence=Sentence(indice=0, start=0, end=34)))])
413 
414      def test_nerdy_process_chained_word(self):
415          """ Test nerdy process """
416          text = 'Hello everyone me, this is   me speaking. And me.'
417 -        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
418 -                                          'everyone me': 'http://example.com/everyone_me',
419 -                                          'me': 'http://example.com/me'})
420 +        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
421 +                                   'everyone me': 'http://example.com/everyone_me',
422 +                                   'me': 'http://example.com/me'})
423          nerdy = core.NerdyProcess((source,))
424          named_entities = nerdy.process_text(text)
425          self.assertEqual(named_entities,
426                           [('http://example.com/everyone_me', None,
427                             Token(word='everyone me', start=6, end=17,
diff --git a/test/test_tokenizer.py b/test/test_tokenizer.py
@@ -15,11 +15,11 @@
428  #
429  # You should have received a copy of the GNU Lesser General Public License along
430  # with this program. If not, see <http://www.gnu.org/licenses/>.
431  import unittest2
432 
433 -from nerdy.tokenizer import RichStringTokenizer, Token, Sentence
434 +from nazca.utils.tokenizer import RichStringTokenizer, Token, Sentence
435 
436 
437  class TokenizerTest(unittest2.TestCase):
438      """ Test of tokenizer """
439 
diff --git a/utils/dataio.py b/utils/dataio.py
@@ -101,11 +101,10 @@
440      indexes form. If indexes is empty, keep raw output"""
441 
442      if not SPARQL_ENABLED:
443          raise ImportError("You have to install SPARQLWrapper and JSON modules to "
444                            "use this function")
445 -
446      sparql = SPARQLWrapper(endpoint)
447      sparql.setQuery(query)
448      sparql.setReturnFormat(JSON)
449      rawresults = sparql.query().convert()
450      labels = rawresults['head']['vars']
diff --git a/named_entities/tokenizer.py b/utils/tokenizer.py
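
The last diff is a pure rename: the tokenizer moves unchanged from named_entities/ to utils/, so only import paths need updating. A minimal sketch of the new import, assuming the RichStringTokenizer signature (text plus token size bounds) is untouched by the move:

    from nazca.utils.tokenizer import RichStringTokenizer

    # token_min_size/token_max_size are assumed keyword arguments here;
    # iterating yields Token objects as seen in the tests above
    tokenizer = RichStringTokenizer('Hello everyone, this is me speaking.',
                                    token_min_size=1, token_max_size=3)
    for token in tokenizer:
        print token.word, token.start, token.end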