[dataio] Merge dataio and tests, related to #187461

author: vincent.michel@logilab.fr
changeset: ec7d7ce1ca35
branch: default
phase: draft
hidden: yes
parent revision: #00d352769ba0 [ner] Cleanup Nerdy, related to #187461
child revision: #cc142a884361 [named entities] Move tokenizer to utils and create a sources module for named entities, related to #187461
files modified by this revision
test/test_dataio.py
test/test_ner_dataio.py
utils/dataio.py
utils/ner_dataio.py
# HG changeset patch
# User vincent.michel@logilab.fr
# Date 1387464321 0
# Thu Dec 19 14:45:21 2013 +0000
# Node ID ec7d7ce1ca35a577deaa8f516b17a5d29f3b11f3
# Parent 00d352769ba0ff7f0d1e8366421322ad98af7e6b
[dataio] Merge dataio and tests, related to #187461

diff --git a/test/test_dataio.py b/test/test_dataio.py
@@ -20,12 +20,15 @@
1  import shutil
2  from contextlib import contextmanager
3  from os import path
4  from tempfile import mkdtemp
5 
6 -from nazca.utils.dataio import sparqlquery, parsefile, autocast, split_file
7 -
8 +from nazca.utils.dataio import (HTMLPrettyPrint, ValidXHTMLPrettyPrint,
9 +                                sparqlquery, rqlquery, parsefile,
10 +                                autocast, split_file)
11 +from nazca.named_entities import NerProcess
12 +from nazca.named_entities.sources import NerSourceLexicon
13 
14  TESTDIR = path.dirname(__file__)
15 
16  @contextmanager
17  def tempdir():
@@ -37,11 +40,58 @@
18              shutil.rmtree(temp)
19          except:
20              pass
21 
22 
23 +class ValidXHTMLPrettyPrintTest(unittest2.TestCase):
24 +
25 +    def test_valid(self):
26 +        from lxml import etree
27 +        if int(etree.__version__< '3.2.0'):
28 +            # https://bugs.launchpad.net/lxml/+bug/673205
29 +            self.skipTest('Lxml version to old for ValidXHTMLPrettyPrint')
30 +        self.assertTrue(ValidXHTMLPrettyPrint().is_valid(u'<p>coucou</p>'))
31 +
32 +    def test_valid_unicode(self):
33 +        from lxml import etree
34 +        if int(etree.__version__< '3.2.0'):
35 +            # https://bugs.launchpad.net/lxml/+bug/673205
36 +            self.skipTest('Lxml version to old for ValidXHTMLPrettyPrint')
37 +        self.assertTrue(ValidXHTMLPrettyPrint().is_valid(u'<p>hé</p>'))
38 +
39 +    def test_invalid(self):
40 +        from lxml import etree
41 +        if int(etree.__version__< '3.2.0'):
42 +            # https://bugs.launchpad.net/lxml/+bug/673205
43 +            self.skipTest('Lxml version to old for ValidXHTMLPrettyPrint')
44 +        self.assertFalse(ValidXHTMLPrettyPrint().is_valid(u'<p><div>coucou</div></p>'))
45 +
46 +    def test_prettyprint(self):
47 +        text = 'Hello everyone, this is   me speaking. And me.'
48 +        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
49 +                                   'me': 'http://example.com/me'})
50 +        ner = NerProcess((source,))
51 +        named_entities = ner.process_text(text)
52 +        html = HTMLPrettyPrint().pprint_text(text, named_entities)
53 +        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone">everyone</a>, '
54 +                                u'this is   <a href="http://example.com/me">me</a> speaking. '
55 +                                u'And <a href="http://example.com/me">me</a>.'))
56 +
57 +    def test_prettyprint_class(self):
58 +        text = 'Hello everyone, this is   me speaking. And me.'
59 +        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
60 +                                   'me': 'http://example.com/me'})
61 +        ner = NerProcess((source,))
62 +        named_entities = ner.process_text(text)
63 +        html = HTMLPrettyPrint().pprint_text(text, named_entities, html_class='ner')
64 +        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone" class="ner">everyone</a>, '
65 +                                u'this is   <a href="http://example.com/me" class="ner">me</a> speaking. '
66 +                                u'And <a href="http://example.com/me" class="ner">me</a>.'))
67 +
68 +
69  class DataIOTestCase(unittest2.TestCase):
70 +
71      def test_parser(self):
72          data = parsefile(path.join(TESTDIR, 'data', 'file2parse'),
73                           [0, (2, 3), 4, 1], delimiter=',')
74          self.assertEqual([[1, (12, 19), u'apple', u'house'],
75                            [2, (21.9, 19), u'stramberry', u'horse'],
@@ -80,10 +130,20 @@
76                  alllines.extend(lines)
77 
78              with open(file2split) as fobj:
79                  self.assertEqual(alllines, fobj.readlines())
80 
81 +    def test_sparql_query(self):
82 +        results = sparqlquery(u'http://dbpedia.org/sparql',
83 +                              u'''SELECT DISTINCT ?uri
84 +                                  WHERE{
85 +                                  ?uri rdfs:label "Python"@en .
86 +                                  ?uri rdf:type ?type}''')
87 +        self.assertEqual(results, [['http://dbpedia.org/resource/Python'],
88 +                                   ['http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage'],
89 +                                   ['http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ']])
90 +
91      def test_sparql_autocast(self):
92          alignset = sparqlquery('http://dbpedia.inria.fr/sparql',
93                                   'prefix db-owl: <http://dbpedia.org/ontology/>'
94                                   'prefix db-prop: <http://fr.dbpedia.org/property/>'
95                                   'select ?ville, ?name, ?long, ?lat where {'
@@ -112,9 +172,14 @@
96                                   ' FILTER (?population > 1000)'
97                                   '} LIMIT 100', indexes=[0, 1, (2, 3)], autocaste_data=False)
98          self.assertEqual(len(alignset), 100)
99          self.assertFalse(isinstance(alignset[0][2][0], float))
100 
101 +    def test_rqlquery(self):
102 +        results = rqlquery('http://www.cubicweb.org',
103 +                           'Any U LIMIT 1 WHERE X cwuri U, X name "apycot"')
104 +        self.assertEqual(results, [[u'http://www.cubicweb.org/1310453']])
105 +
106 
107  if __name__ == '__main__':
108      unittest2.main()
109 
diff --git a/test/test_ner_dataio.py b/test/test_ner_dataio.py
@@ -1,85 +0,0 @@
110 -# -*- coding:utf-8 -*-
111 -#
112 -# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
113 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
114 -#
115 -# This program is free software: you can redistribute it and/or modify it under
116 -# the terms of the GNU Lesser General Public License as published by the Free
117 -# Software Foundation, either version 2.1 of the License, or (at your option)
118 -# any later version.
119 -#
120 -# This program is distributed in the hope that it will be useful, but WITHOUT
121 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
122 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
123 -# details.
124 -#
125 -# You should have received a copy of the GNU Lesser General Public License along
126 -# with this program. If not, see <http://www.gnu.org/licenses/>.
127 -import unittest2
128 -
129 -from nerdy import dataio, core
130 -
131 -
132 -class DataioTest(unittest2.TestCase):
133 -    """ Test of dataio """
134 -
135 -    def test_sparql_query(self):
136 -        results = dataio.sparql_query(query=u'''SELECT ?uri
137 -                                                WHERE{
138 -                                                ?uri rdfs:label "Python"@en .
139 -                                                ?uri rdf:type ?type}''',
140 -                                      endpoint=u'http://dbpedia.org/sparql')
141 -        truth = [{u'uri':
142 -                  {u'type': u'uri',
143 -                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage'}},
144 -                 {u'uri':
145 -                  {u'type': u'uri',
146 -                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'}}]
147 -        self.assertEqual(results, truth)
148 -
149 -    def test_rql_url_query(self):
150 -        results = dataio.rql_url_query('Any U LIMIT 1 WHERE X cwuri U, X name "apycot"',
151 -                                       'http://www.cubicweb.org')
152 -        self.assertEqual(results, [[u'http://www.cubicweb.org/1310453']])
153 -
154 -    def test_prettyprint(self):
155 -        text = 'Hello everyone, this is   me speaking. And me.'
156 -        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
157 -                                          'me': 'http://example.com/me'})
158 -        nerdy = core.NerdyProcess((source,))
159 -        named_entities = nerdy.process_text(text)
160 -        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities)
161 -        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone">everyone</a>, '
162 -                                u'this is   <a href="http://example.com/me">me</a> speaking. '
163 -                                u'And <a href="http://example.com/me">me</a>.'))
164 -
165 -    def test_prettyprint_class(self):
166 -        text = 'Hello everyone, this is   me speaking. And me.'
167 -        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
168 -                                          'me': 'http://example.com/me'})
169 -        nerdy = core.NerdyProcess((source,))
170 -        named_entities = nerdy.process_text(text)
171 -        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities, html_class='ner')
172 -        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone" class="ner">everyone</a>, '
173 -                                u'this is   <a href="http://example.com/me" class="ner">me</a> speaking. '
174 -                                u'And <a href="http://example.com/me" class="ner">me</a>.'))
175 -
176 -
177 -class NerdyValidXHTMLPrettyPrintTest(unittest2.TestCase):
178 -
179 -    def test_valid(self):
180 -        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
181 -            '<p>coucou</p>'))
182 -
183 -    def test_valid_unicode(self):
184 -        self.assertTrue(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
185 -            u'<p>hé</p>'))
186 -
187 -    def test_invalid(self):
188 -        self.assertFalse(dataio.NerdyValidXHTMLPrettyPrint().is_valid(
189 -            '<p><div>coucou</div></p>'))
190 -
191 -
192 -if __name__ == '__main__':
193 -    unittest2.main()
194 -
diff --git a/utils/dataio.py b/utils/dataio.py
@@ -19,10 +19,12 @@
195  from os import path as osp
196 
197  import csv
198  import urllib
199 
200 +from lxml import etree
201 +
202  try:
203      from SPARQLWrapper import SPARQLWrapper, JSON
204      SPARQL_ENABLED = True
205  except ImportError:
206      SPARQL_ENABLED = False
@@ -48,24 +50,49 @@
207 
208 
209  ###############################################################################
210  ### RQL FUNCTIONS #############################################################
211  ###############################################################################
212 -def rqlquery(host, rql, indexes=None, formatopt=None):
213 -    """ Run the rql query on the given cubicweb host
214 +def get_cw_cnx(endpoint):
215 +    """ Get a cnx on a CubicWeb database
216      """
217 -
218 -    if host.endswith('/'):
219 -        host = host[:-1]
220 +    from cubicweb import dbapi
221 +    from cubicweb.cwconfig import CubicWebConfiguration
222 +    from cubicweb.entities import AnyEntity
223 +    CubicWebConfiguration.load_cwctl_plugins()
224 +    config = CubicWebConfiguration.config_for(endpoint)
225 +    sourceinfo = config.sources()['admin']
226 +    login = sourceinfo['login']
227 +    password = sourceinfo['password']
228 +    _, cnx = dbapi.in_memory_repo_cnx(config, login, password=password)
229 +    req = cnx.request()
230 +    return req
231 
232 -    indexes = indexes or []
233 -    filehandle = urllib.urlopen('%(host)s/view?'
234 -                                'rql=%(rql)s&vid=csvexport'
235 -                                % {'rql': rql, 'host': host})
236 -    filehandle.readline()#Skip the first line
237 -    return parsefile(filehandle, delimiter=';', indexes=indexes,
238 -                     formatopt=formatopt);
239 +def rqlquery(host, rql, indexes=None, formatopt=None, _cache_cnx={}, **kwargs):
240 +    """ Run the rql query on the given cubicweb host
241 +    Additional arguments can be passed to be properly substitued
242 +    in the execute() function for appid accces.
243 +    """
244 +    if host.startswith('http://'):
245 +        # By url
246 +        if host.endswith('/'):
247 +            host = host[:-1]
248 +        indexes = indexes or []
249 +        filehandle = urllib.urlopen('%(host)s/view?'
250 +                                    'rql=%(rql)s&vid=csvexport'
251 +                                    % {'rql': rql, 'host': host})
252 +        filehandle.readline()#Skip the first line
253 +        return parsefile(filehandle, delimiter=';', indexes=indexes,
254 +                         formatopt=formatopt);
255 +    else:
256 +        # By appid
257 +        if host in _cache_cnx:
258 +            cnx = _cache_cnx[host]
259 +        else:
260 +            cnx = get_cw_cnx(host)
261 +            _cache_cnx[host] = cnx
262 +        return cnx.execute(query, kwargs)
263 
264 
265  ###############################################################################
266  ### SPARQL FUNCTIONS ##########################################################
267  ###############################################################################
@@ -220,5 +247,75 @@
268                  continue
269              outfile.write(line)
270          outfile.close()
271          count += 1
272      return map(str, xrange(count))
273 +
274 +
275 +###############################################################################
276 +### OUTPUT UTILITIES ##########################################################
277 +###############################################################################
278 +class AbstractPrettyPrint(object):
279 +    """ Pretty print the output of a named entities process
280 +    """
281 +
282 +    def pprint_text(self, text, named_entities, **kwargs):
283 +        newtext = u''
284 +        indice = 0
285 +        tindices = dict([(t.start, (uri, t)) for uri, p, t in named_entities])
286 +        while indice < len(text):
287 +            if indice in tindices:
288 +                uri, t = tindices[indice]
289 +                words = text[t.start:t.end]
290 +                fragment = self.pprint_entity(uri, words, **kwargs)
291 +                if not self.is_valid(newtext+fragment+text[t.end:]):
292 +                    fragment = words
293 +                newtext += fragment
294 +                indice = t.end
295 +            else:
296 +                newtext += text[indice]
297 +                indice += 1
298 +        return newtext
299 +
300 +    def pprint_entity(self, uri, word, **kwargs):
301 +        """ Pretty print an entity """
302 +        raise NotImplementedError
303 +
304 +    def is_valid(self, newtext):
305 +        """Override to check the validity of the prettified content at each
306 +        enrichement step"""
307 +        return True
308 +
309 +
310 +class HTMLPrettyPrint(AbstractPrettyPrint):
311 +    """ Pretty print the output of a named entities process, in HTML
312 +    """
313 +
314 +    def pprint_entity(self, uri, word, **kwargs):
315 +        """ Pretty print an entity """
316 +        klass = ' class="%s"' % kwargs['html_class'] if 'html_class' in kwargs else ''
317 +        return u'<a href="%s"%s>%s</a>' % (uri, klass, word)
318 +
319 +
320 +class ValidXHTMLPrettyPrint(HTMLPrettyPrint):
321 +    """ Pretty print the output of a named entities process,
322 +    in valid XHTML.
323 +    """
324 +
325 +    XHTML_DOC_TEMPLATE = '''\
326 +<?xml version="1.0" encoding="UTF-8" ?>
327 +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
328 +<html xmlns="http://www.w3.org/1999/xhtml">
329 +<head>
330 +<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
331 +<title>ner</title>
332 +</head>
333 +<body><div>%s</div></body>
334 +</html>'''
335 +
336 +    def is_valid(self, html):
337 +        try:
338 +            etree.fromstring(self.XHTML_DOC_TEMPLATE % html.encode('utf-8'),
339 +                          parser=etree.XMLParser(dtd_validation=True))
340 +        except etree.XMLSyntaxError:
341 +            return False
342 +        return True
diff --git a/utils/ner_dataio.py b/utils/ner_dataio.py
@@ -1,140 +0,0 @@
343 -# -*- coding: utf-8 -*-
344 -""" IO for Named Entities Recognition.
345 -"""
346 -import json
347 -import urllib
348 -import lxml.etree as ET
349 -
350 -
351 -###############################################################################
352 -### SPARQL UTILITIES ##########################################################
353 -###############################################################################
354 -def sparql_query(query, endpoint):
355 -    """ Execute a query on an endpoint:
356 -
357 -    sparql_query(query=u'''SELECT ?uri ?type
358 -                           WHERE{
359 -                           ?uri rdfs:label "Python"@en .
360 -                           ?uri rdf:type ?type}''',
361 -                           endpoint=u'http://dbpedia.org/sparql')
362 -    """
363 -    from SPARQLWrapper import SPARQLWrapper, JSON
364 -    sparql = SPARQLWrapper(endpoint)
365 -    sparql.setQuery(query)
366 -    sparql.setReturnFormat(JSON)
367 -    try:
368 -        rawresults = sparql.query().convert()
369 -        labels = rawresults['head']['vars']
370 -        return rawresults["results"]["bindings"]
371 -    except:
372 -        print 'Error in sparql query'
373 -        return []
374 -
375 -
376 -###############################################################################
377 -### RQL UTILITIES #############################################################
378 -###############################################################################
379 -def get_cw_cnx(endpoint):
380 -    """ Get a cnx on a CubicWeb database
381 -    """
382 -    from cubicweb import dbapi
383 -    from cubicweb.cwconfig import CubicWebConfiguration
384 -    from cubicweb.entities import AnyEntity
385 -    CubicWebConfiguration.load_cwctl_plugins()
386 -    config = CubicWebConfiguration.config_for(endpoint)
387 -    sourceinfo = config.sources()['admin']
388 -    login = sourceinfo['login']
389 -    password = sourceinfo['password']
390 -    _, cnx = dbapi.in_memory_repo_cnx(config, login, password=password)
391 -    req = cnx.request()
392 -    return req
393 -
394 -def rql_appid_query(query, endpoint, _cache_cnx={}, **kwargs):
395 -    """ Execute a query on an appid endpoint:
396 -
397 -    rql_query('Any X WHERE X label "Python"', 'localhost')
398 -
399 -    Additional arguments can be passed to be properly substitued
400 -    in the execute() function.
401 -    """
402 -    if endpoint in _cache_cnx:
403 -        cnx = _cache_cnx[endpoint]
404 -    else:
405 -        cnx = get_cw_cnx(endpoint)
406 -        _cache_cnx[endpoint] = cnx
407 -    return cnx.execute(query, kwargs)
408 -
409 -def rql_url_query(query, endpoint):
410 -    """ Execute a query on an url endpoint:
411 -
412 -    rql_query('Any X WHERE X label "Python"', 'localhost')
413 -    """
414 -    url = urllib.basejoin(endpoint, '?rql=%s&vid=jsonexport' % query)
415 -    return json.loads(urllib.urlopen(url).read())
416 -
417 -
418 -###############################################################################
419 -### OUTPUT UTILITIES ##########################################################
420 -###############################################################################
421 -class AbstractNerdyPrettyPrint(object):
422 -    """ Pretty print the output of a Nerdy process
423 -    """
424 -
425 -    def pprint_text(self, text, named_entities, **kwargs):
426 -        newtext = u''
427 -        indice = 0
428 -        tindices = dict([(t.start, (uri, t)) for uri, p, t in named_entities])
429 -        while indice < len(text):
430 -            if indice in tindices:
431 -                uri, t = tindices[indice]
432 -                words = text[t.start:t.end]
433 -                fragment = self.pprint_entity(uri, words, **kwargs)
434 -                if not self.is_valid(newtext+fragment+text[t.end:]):
435 -                    fragment = words
436 -                newtext += fragment
437 -                indice = t.end
438 -            else:
439 -                newtext += text[indice]
440 -                indice += 1
441 -        return newtext
442 -
443 -    def pprint_entity(self, uri, word, **kwargs):
444 -        """ Pretty print an entity """
445 -        raise NotImplementedError
446 -
447 -    def is_valid(self, newtext):
448 -        """Override to check the validity of the prettified content at each
449 -        enrichement step"""
450 -        return True
451 -
452 -
453 -class NerdyHTMLPrettyPrint(AbstractNerdyPrettyPrint):
454 -    """ Pretty print the output of a Nerdy process
455 -    """
456 -
457 -    def pprint_entity(self, uri, word, **kwargs):
458 -        """ Pretty print an entity """
459 -        klass = ' class="%s"' % kwargs['html_class'] if 'html_class' in kwargs else ''
460 -        return u'<a href="%s"%s>%s</a>' % (uri, klass, word)
461 -
462 -
463 -class NerdyValidXHTMLPrettyPrint(NerdyHTMLPrettyPrint):
464 -
465 -    XHTML_DOC_TEMPLATE = '''\
466 -<?xml version="1.0" encoding="UTF-8" ?>
467 -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
468 -<html xmlns="http://www.w3.org/1999/xhtml">
469 -<head>
470 -<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
471 -<title>nerdy</title>
472 -</head>
473 -<body><div>%s</div></body>
474 -</html>'''
475 -
476 -    def is_valid(self, html):
477 -        try:
478 -            ET.fromstring(self.XHTML_DOC_TEMPLATE % html.encode('utf-8'),
479 -                          parser=ET.XMLParser(dtd_validation=True))
480 -        except ET.XMLSyntaxError:
481 -            return False
482 -        return True