[utils] Create a utils folder, related to #187461

author: vincent.michel@logilab.fr
changeset: b323882735ec
branch: default
phase: draft
hidden: yes
parent revision: #1cff8a50b49f [ner] Remove unused files and move tests, related to #187461
child revision: #36951167576c [pkginfo] Rename ner in named_entities and update pkginfo, related to #187461
files modified by this revision
dataio.py
distances.py
minhashing.py
ner/dataio.py
normalize.py
record_linkage/aligner.py
record_linkage/blocking.py
test/test_alignment.py
test/test_blocking.py
test/test_dataio.py
test/test_distances.py
test/test_minhashing.py
test/test_normalize.py
utils/__init__.py
utils/dataio.py
utils/distances.py
utils/minhashing.py
utils/ner_dataio.py
utils/normalize.py
# HG changeset patch
# User vincent.michel@logilab.fr
# Date 1387464298 0
# Thu Dec 19 14:44:58 2013 +0000
# Node ID b323882735ecefe2f96381e3500fd3c91eef217d
# Parent 1cff8a50b49f3eab55783615a9835b4827c6ced7
[utils] Create a utils folder, related to #187461

diff --git a/dataio.py b/dataio.py
@@ -1,224 +0,0 @@
1 -# -*- coding:utf-8 -*-
2 -# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
3 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
4 -#
5 -# This program is free software: you can redistribute it and/or modify it under
6 -# the terms of the GNU Lesser General Public License as published by the Free
7 -# Software Foundation, either version 2.1 of the License, or (at your option)
8 -# any later version.
9 -#
10 -# This program is distributed in the hope that it will be useful, but WITHOUT
11 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
13 -# details.
14 -#
15 -# You should have received a copy of the GNU Lesser General Public License along
16 -# with this program. If not, see <http://www.gnu.org/licenses/>.
17 -
18 -from os.path import exists as fileexists
19 -from os import path as osp
20 -
21 -import csv
22 -import urllib
23 -
24 -try:
25 -    from SPARQLWrapper import SPARQLWrapper, JSON
26 -    SPARQL_ENABLED = True
27 -except ImportError:
28 -    SPARQL_ENABLED = False
29 -
30 -
31 -###############################################################################
32 -### UTILITY FUNCTIONS #########################################################
33 -###############################################################################
34 -def autocast(data, encoding=None):
35 -    """ Try to convert data into a specific type
36 -    in (int, float, str)
37 -    """
38 -    try:
39 -        return int(data)
40 -    except ValueError:
41 -        try:
42 -            return float(data.replace(',', '.'))
43 -        except ValueError:
44 -            data = data.strip()
45 -            if encoding:
46 -                return data.decode(encoding)
47 -            return data
48 -
49 -
50 -###############################################################################
51 -### RQL FUNCTIONS #############################################################
52 -###############################################################################
53 -def rqlquery(host, rql, indexes=None, formatopt=None):
54 -    """ Run the rql query on the given cubicweb host
55 -    """
56 -
57 -    if host.endswith('/'):
58 -        host = host[:-1]
59 -
60 -    indexes = indexes or []
61 -    filehandle = urllib.urlopen('%(host)s/view?'
62 -                                'rql=%(rql)s&vid=csvexport'
63 -                                % {'rql': rql, 'host': host})
64 -    filehandle.readline()  # Skip the first line
65 -    return parsefile(filehandle, delimiter=';', indexes=indexes,
66 -                     formatopt=formatopt)
67 -
68 -
69 -###############################################################################
70 -### SPARQL FUNCTIONS ##########################################################
71 -###############################################################################
72 -def sparqlquery(endpoint, query, indexes=None, autocaste_data=True):
73 -    """ Run the sparql query on the given endpoint, and wrap the items in the
74 -    indexes form. If indexes is empty, keep raw output"""
75 -
76 -    if not SPARQL_ENABLED:
77 -        raise ImportError("You have to install the SPARQLWrapper and JSON modules to "
78 -                          "use this function")
79 -
80 -    sparql = SPARQLWrapper(endpoint)
81 -    sparql.setQuery(query)
82 -    sparql.setReturnFormat(JSON)
83 -    rawresults = sparql.query().convert()
84 -    labels = rawresults['head']['vars']
85 -    results = []
86 -    indexes = indexes or []
87 -    if autocaste_data:
88 -        transform = autocast
89 -    else:
90 -        def transform(*args): return args
91 -    for raw in rawresults["results"]["bindings"]:
92 -        data = []
93 -        if not indexes:
94 -            data = [transform(raw[label]['value']) for label in labels]
95 -        else:
96 -            for il, ind in enumerate(indexes):
97 -                if isinstance(ind, tuple):
98 -                    data.append(tuple([transform(raw[labels[i]]['value']) for i in ind]))
99 -                else:
100 -                    data.append(transform(raw[labels[il]]['value']))
101 -        results.append(data)
102 -    return results
103 -
104 -
105 -###############################################################################
106 -### FILE FUNCTIONS ############################################################
107 -###############################################################################
108 -def parsefile(filename, indexes=None, nbmax=None, delimiter='\t',
109 -              encoding='utf-8', field_size_limit=None, formatopt=None):
110 -    """ Parse the file (read at most ``nbmax`` lines if given). Each
111 -        line is split according to ``delimiter`` and only ``indexes`` are kept
112 -
113 -        eg : The file is :
114 -                1, house, 12, 19, apple
115 -                2, horse, 21.9, 19, strawberry
116 -                3, flower, 23, 2.17, cherry
117 -
118 -            >>> data = parsefile('myfile', [0, (2, 3), 4, 1], delimiter=',')
119 -            data = [[1, (12, 19), u'apple', u'house'],
120 -                    [2, (21.9, 19), u'strawberry', u'horse'],
121 -                    [3, (23, 2.17), u'cherry', u'flower']]
122 -
123 -            By default, all cells are "autocast" (thanks to the
124 -            ``autocast()`` function), but you can override this with the
125 -            ``formatopt`` dictionary. Each key is the index to work on, and the
126 -            value is the function to call. See the following example:
127 -
128 -            >>> data = parsefile('myfile', [0, (2, 3), 4, 1], delimiter=',',
129 -            >>>                  formatopt={2:lambda x:x.decode('utf-8')})
130 -            data = [[1, (u'12', 19), u'apple', u'house'],
131 -                    [2, (u'21.9', 19), u'strawberry', u'horse'],
132 -                    [3, (u'23', 2.17), u'cherry', u'flower']]
133 -
134 -    """
135 -    def formatedoutput(filename):
136 -        if field_size_limit:
137 -            csv.field_size_limit(field_size_limit)
138 -
139 -        if isinstance(filename, basestring):
140 -            csvfile = open(filename, 'r')
141 -        else:
142 -            csvfile = filename
143 -        reader = csv.reader(csvfile, delimiter=delimiter)
144 -        for row in reader:
145 -            yield [cell.strip() for cell in row]
146 -        csvfile.close()
147 -
148 -
149 -
150 -    result = []
151 -    indexes = indexes or []
152 -    formatopt = formatopt or {}
153 -    for ind, row in enumerate(formatedoutput(filename)):
154 -        row = [formatopt.get(i, lambda x: autocast(x, encoding))(cell)
155 -               for i, cell in enumerate(row)]
156 -        data = []
157 -        if nbmax and ind > nbmax:
158 -            break
159 -        if not indexes:
160 -            data = row
161 -        else:
162 -            for ind in indexes:
163 -                if isinstance(ind, tuple):
164 -                    data.append(tuple([row[i] for i in ind]))
165 -                    if '' in data[-1]:
166 -                        data[-1] = None
167 -                elif row[ind]:
168 -                    data.append(row[ind])
169 -                else:
170 -                    data.append(None)
171 -
172 -        result.append(data)
173 -    return result
174 -
175 -def write_results(matched, alignset, targetset, resultfile):
176 -    """ Write the matched dictionary, given an alignset and a targetset, to
177 -        the resultfile
178 -    """
179 -    openmode = 'a' if fileexists(resultfile) else 'w'
180 -    with open(resultfile, openmode) as fobj:
181 -        if openmode == 'w':
182 -            fobj.write('aligned;targetted;distance\n')
183 -        for aligned in matched:
184 -            for target, dist in matched[aligned]:
185 -                alignid = alignset[aligned][0]
186 -                targetid = targetset[target][0]
187 -                fobj.write('%s;%s;%s\n' %
188 -                    (alignid.encode('utf-8') if isinstance(alignid, basestring)
189 -                                             else alignid,
190 -                     targetid.encode('utf-8') if isinstance(targetid, basestring)
191 -                                              else targetid,
192 -                     dist
193 -                     ))
194 -
195 -def split_file(filename, outputdir, nblines=60000):
196 -    """ Split `filename` into smaller files of ``nblines`` lines. Files are
197 -        written into `outputdir`.
198 -
199 -        Return the list of files
200 -    """
201 -    NEW = object()
202 -
203 -    def readlines(fobj, nblines):
204 -        """ yield all lines of the file, and
205 -        at split-file boundaries, yield a NEW marker
206 -        """
207 -        for index, line in enumerate(fobj):
208 -            if index and index % nblines == 0:
209 -                yield NEW
210 -            yield line
211 -
212 -    count = 0
213 -    with open(filename, 'rb') as fobj:
214 -        outfile = open(osp.join(outputdir, '%s' % count), 'wb')
215 -        for line in readlines(fobj, nblines):
216 -            if line is NEW:
217 -                outfile.close()
218 -                count += 1
219 -                outfile = open(osp.join(outputdir, '%s' % count), 'wb')
220 -                continue
221 -            outfile.write(line)
222 -        outfile.close()
223 -        count += 1
224 -    return map(str, xrange(count))
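For context, a minimal usage sketch (not part of the changeset) of the dataio helpers removed above, in Python 2 and against the pre-move import path (nazca.dataio; after this changeset the module lives under nazca.utils). The file name 'people.csv' and its contents are hypothetical.

from nazca.dataio import autocast, parsefile

assert autocast('12') == 12
assert autocast('21,9') == 21.9        # decimal comma is converted
assert autocast(' apple ') == 'apple'  # falls back to a stripped string

# Keep column 0, group columns 2 and 3 into a tuple, then keep column 4
rows = parsefile('people.csv', indexes=[0, (2, 3), 4], delimiter=',')
for row in rows:
    print row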
diff --git a/distances.py b/distances.py
@@ -1,456 +0,0 @@
225 -# -*- coding:utf-8 -*-
226 -# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
227 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
228 -#
229 -# This program is free software: you can redistribute it and/or modify it under
230 -# the terms of the GNU Lesser General Public License as published by the Free
231 -# Software Foundation, either version 2.1 of the License, or (at your option)
232 -# any later version.
233 -#
234 -# This program is distributed in the hope that it will be useful, but WITHOUT
235 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
236 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
237 -# details.
238 -#
239 -# You should have received a copy of the GNU Lesser General Public License along
240 -# with this program. If not, see <http://www.gnu.org/licenses/>.
241 -
242 -from functools import partial
243 -from math import cos, sqrt, pi #Needed for geographical distance
244 -try:
245 -    from dateutil import parser as dateparser
246 -    DATEUTIL_ENABLED = True
247 -except ImportError:
248 -    DATEUTIL_ENABLED = False
249 -from scipy import matrix, empty
250 -
251 -from nazca.normalize import tokenize
252 -
253 -
254 -###############################################################################
255 -### UTILITY FUNCTIONS #########################################################
256 -###############################################################################
257 -def cdist(distance_callback, refset, targetset, matrix_normalized=False,
258 -          ref_indexes=None, target_indexes=None):
259 -    """ Compute the metric matrix, given two datasets and a metric
260 -
261 -    Parameters
262 -    ----------
263 -    refset: a dataset (list of records)
264 -
265 -    targetset: a dataset (list of records)
266 -
267 -    Returns
268 -    -------
269 -
270 -    A distance matrix, of shape (len(refset), len(targetset))
271 -    with the distance of each element in it.
272 -    """
273 -    ref_indexes = ref_indexes or xrange(len(refset))
274 -    target_indexes = target_indexes or xrange(len(targetset))
275 -    distmatrix = empty((len(ref_indexes), len(target_indexes)), dtype='float32')
276 -    size = distmatrix.shape
277 -    for i, iref in enumerate(ref_indexes):
278 -        for j, jref in enumerate(target_indexes):
279 -            d = 1
280 -            if refset[iref] and targetset[jref]:
281 -                d = distance_callback(refset[iref], targetset[jref])
282 -                if matrix_normalized:
283 -                    d = 1 - (1.0/(1.0 + d))
284 -            distmatrix[i, j] = d
285 -    return distmatrix
286 -
287 -def _handlespaces(stra, strb, distance, tokenizer=None, **kwargs):
288 -    """ Compute the matrix of distances between all tokens of stra and strb
289 -        (with function ``distance``). Extra args are given to the distance
290 -        function
291 -
292 -        The distance returned is defined as the max of the min of each row of
293 -        each distance matrix, see the examples below:
294 -
295 -                 |  Victor |  Hugo                  Victor | Jean | Hugo
296 -         Victor  |     0   |    5           Victor |  0    |  6   |  5
297 -          Jean   |     6   |    4           Hugo   |  5    |  4   |  0
298 -          Hugo   |     5   |    0
299 -
300 -                 --> 4                                --> 0
301 -
302 -        Return 4
303 -    """
304 -
305 -    if ' ' not in stra:
306 -        stra += ' '
307 -    if ' ' not in strb:
308 -        strb += ' '
309 -
310 -    toka = tokenize(stra, tokenizer)
311 -    tokb = tokenize(strb, tokenizer)
312 -    # If not same number of tokens, complete the smallest list with empty strings
313 -    if len(toka) != len(tokb):
314 -        mint = toka if len(toka)<len(tokb) else tokb
315 -        maxt = toka if len(toka)>len(tokb) else tokb
316 -        mint.extend(['' for i in range(len(maxt)-len(mint))])
317 -
318 -    listmatrix = []
319 -    for i in xrange(len(toka)):
320 -        listmatrix.append([distance(toka[i], tokb[j], **kwargs) for j in xrange(len(tokb))])
321 -    m = matrix(listmatrix)
322 -    minlist = [m[i,:].min() for i in xrange(m.shape[0])]
323 -    minlist.extend([m[:,i].min() for i in xrange(m.shape[1])])
324 -    return max(minlist)
325 -
326 -
327 -###############################################################################
328 -### NUMERICAL DISTANCES #######################################################
329 -###############################################################################
330 -def euclidean(a, b):
331 -    """ Simple euclidean distance
332 -    """
333 -    try:
334 -        return abs(a - b)
335 -    except TypeError:
336 -        #a and b may be strings
337 -        return abs(float(a) - float(b))
338 -
339 -
340 -###############################################################################
341 -### STRING DISTANCES ##########################################################
342 -###############################################################################
343 -def levenshtein(stra, strb, tokenizer=None):
344 -    """ Compute the Levenshtein distance between stra and strb.
345 -
346 -    The Levenshtein distance is defined as the minimal cost to transform stra
347 -    into strb, where 3 operations are allowed:
348 -        - Replace one character of stra by a character of strb
349 -        - Add one character of strb into stra
350 -        - Remove one character from stra
351 -
352 -        If spaces are found in stra or strb, this method returns
353 -            _handlespaces(stra, strb, levenshtein)
354 -    """
355 -    if ' ' in stra or ' ' in strb:
356 -        return _handlespaces(stra, strb, levenshtein, tokenizer)
357 -
358 -    lenb = len(strb)
359 -    onerowago = None
360 -    thisrow = range(1, lenb + 1) + [0]
361 -    for x in xrange(len(stra)):
362 -        onerowago, thisrow = thisrow, [0]*lenb + [x+1]
363 -        for y in xrange(lenb):
364 -            delcost = onerowago[y] + 1
365 -            addcost = thisrow[y - 1] + 1
366 -            subcost = onerowago[y - 1] + (stra[x] != strb[y])
367 -            thisrow[y] = min(delcost, addcost, subcost)
368 -    return thisrow[lenb - 1]
369 -
370 -def soundexcode(word, language='french'):
371 -    """ Return the Soundex code of the word ``word``
372 -        For more information about soundex code see wiki_
373 -
374 -        ``language`` can be 'french' or 'english'
375 -
376 -        .. _wiki: https://en.wikipedia.org/wiki/Soundex
377 -
378 -        Note: ``word`` is expected to be a single word; strings containing
379 -        spaces are handled by ``soundex()``, via _handlespaces(stra, strb, soundex, language=language)
380 -    """
381 -
382 -    vowels = 'AEHIOUWY'
383 -    if language.lower() == 'french' :
384 -        consonnantscode = {'B': '1', 'P': '1',
385 -                           'C': '2', 'K': '2', 'Q': '2',
386 -                           'D': '3', 'T': '3',
387 -                           'L': '4',
388 -                           'M': '5', 'N': '5',
389 -                           'R': '6',
390 -                           'G': '7', 'J': '7',
391 -                           'X': '8', 'Z': '8', 'S': '8',
392 -                           'F': '9', 'V': '9'
393 -                          }
394 -    elif language.lower() == 'english':
395 -        consonnantscode = {'B': '1', 'F': '1', 'P': '1', 'V': '1',
396 -                           'C': '2', 'G': '2', 'J': '2', 'K': '2',
397 -                           'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
398 -                           'D': '3', 'T': '3',
399 -                           'L': '4',
400 -                           'M': '5', 'N': '5',
401 -                           'R': '6'
402 -                          }
403 -    else:
404 -        raise NotImplementedError('Soundex code is not supported (yet?) for '
405 -                                  'this language (%s). '
406 -                                  'Supported languages are french and english' % language)
407 -    word = word.strip().upper()
408 -    code = word[0]
409 -    # After the following ``for`` loop, ``code`` is
410 -    # the first letter of ``word`` followed by the consonants of ``word``,
411 -    # where, of consecutive consonants sharing the same code, only the first is kept,
412 -    # and, of two same-code consonants separated by a W or an H, only one
413 -    # is kept as well.
414 -    for i in xrange(1, len(word)):
415 -        if word[i] in vowels:
416 -            continue
417 -        if word[i - 1] not in vowels and \
418 -           consonnantscode[word[i]] == consonnantscode.get(code[-1], ''):
419 -            continue
420 -        if i + 2 < len(word) and word[i + 1] in 'WH' and \
421 -           consonnantscode[word[i]] == consonnantscode.get(word[i + 2], ''):
422 -            continue
423 -        code += word[i]
424 -        if len(code) > 4:
425 -            break
426 -
427 -    #Replace according to the codes
428 -    code = code[0] + ''.join([consonnantscode[c] for c in code[1:]])
429 -    ###First four letters, completed by zeros
430 -    return code[:4] + '0'*(4 - len(code))
431 -
432 -def soundex(stra, strb, language='french', tokenizer=None):
433 -    """ Return the 1/0 distance between the soundex code of stra and strb.
434 -        0 means they have the same code, 1 they don't
435 -    """
436 -    if ' ' in stra or ' ' in strb:
437 -        return _handlespaces(stra, strb, soundex, tokenizer=tokenizer, language=language)
438 -
439 -    return 0 if (soundexcode(stra, language) == soundexcode(strb, language)) \
440 -             else 1
441 -
442 -def jaccard(stra, strb, tokenizer=None):
443 -    """ Return the jaccard distance between stra and strb, considering the
444 -        token sets of stra and strb. If no tokenizer is given, it uses
445 -        nazca.normalize.tokenize's default one.
446 -
447 -        J(A, B) = |A \cap B| / |A \cup B|
448 -        d(A, B) = 1 - J(A, B)
449 -    """
450 -    seta = set(tokenize(stra, tokenizer))
451 -    setb = set(tokenize(strb, tokenizer))
452 -    return generic_jaccard(seta, setb)
453 -
454 -def generic_jaccard(seta, setb):
455 -    """ Return the jaccard distance between two sets A and B.
456 -
457 -        J(A, B) = |A \cap B| / |A \cup B|
458 -        d(A, B) = 1 - J(A, B)
459 -    """
460 -    return 1.0 - 1.0*len(seta.intersection(setb))/len(seta.union(setb))
461 -
462 -
463 -###############################################################################
464 -### TEMPORAL DISTANCES ########################################################
465 -###############################################################################
466 -if DATEUTIL_ENABLED:
467 -    class FrenchParserInfo(dateparser.parserinfo):
468 -        """ Inherit from dateutil.parser.parserinfo and translate the
469 -            English-dependent variables into French.
470 -        """
471 -
472 -        HMS = [(u'h', u'heure', u'heures'),
473 -               (u'm', u'minute', u'minutes'),
474 -               (u's', u'seconde', u'secondes'),]
475 -        JUMP = [u' ', u'.', u',', u';', u'-', u'/', u"'",
476 -               u'a', u'le', u'et', u'er']
477 -        MONTHS = [(u'Jan', u'Janvier'), (u'Fev', u'Fevrier'),
478 -                  (u'Mar', u'Mars'), (u'Avr', u'Avril'), (u'Mai', u'Mai'),
479 -                  (u'Jun', u'Juin'), (u'Jui', u'Juillet'),
480 -                  (u'Aou', u'Aout'), (u'Sep', u'Septembre'),
481 -                  (u'Oct', u'Octobre'), (u'Nov', u'Novembre'),
482 -                  (u'Dec', u'Decembre')]
483 -        PERTAIN = [u'de']
484 -        WEEKDAYS = [(u'Lun', u'Lundi'),
485 -                    (u'Mar', u'Mardi'),
486 -                    (u'Mer', u'Mercredi'),
487 -                    (u'Jeu', u'Jeudi'),
488 -                    (u'Ven', u'Vendredi'),
489 -                    (u'Sam', u'Samedi'),
490 -                    (u'Dim', u'Dimanche')]
491 -
492 -    def temporal(stra, strb, granularity=u'days', parserinfo=FrenchParserInfo,
493 -                 dayfirst=True, yearfirst=False):
494 -        """ Return the distance between two strings (read as dates).
495 -
496 -            ``granularity`` can be either ``days`` or ``months`` or ``years``
497 -            (note the plural form!)
498 -            ``parserinfo`` defines the language of the dates (French by default)
499 -
500 -            ``dayfirst`` and ``yearfirst`` are used in case of ambiguity, for
501 -            instance 09/09/09; by default it is read as day/month/year
502 -
503 -            Neither stra nor strb may contain accented characters. Clean them beforehand.
504 -        """
505 -
506 -        datea = dateparser.parse(stra, parserinfo=parserinfo(dayfirst,
507 -                                 yearfirst), fuzzy=True)
508 -        dateb = dateparser.parse(strb, parserinfo=parserinfo(dayfirst,
509 -                                 yearfirst), fuzzy=True)
510 -        diff = datea - dateb
511 -        if granularity.lower() == 'years':
512 -            return abs(diff.days/365.25)
513 -        if granularity.lower() == 'months':
514 -            return abs(diff.days/30.5)
515 -        return abs(diff.days)
516 -
517 -
518 -###############################################################################
519 -### GEOGRAPHICAL DISTANCES ####################################################
520 -###############################################################################
521 -def geographical(pointa, pointb, in_radians=False, planet_radius=6371009,
522 -                 units='m'):
523 -    """ Return the geographical distance between two points.
524 -
525 -        Both points must be tuples (latitude, longitude)
526 -
527 -        - in_radians is True if latitude and longitude are given in radians,
528 -          False otherwise
529 -        - planet_radius is the planet's radius in meters. By default, it is
530 -          the Earth's.
531 -
532 -        - `units` can be 'm' (meters) or 'km' (kilometers)
533 -    """
534 -    pointa = (float(pointa[0]), float(pointa[1]))
535 -    pointb = (float(pointb[0]), float(pointb[1]))
536 -
537 -    difflat = pointa[0] - pointb[0]
538 -    difflong = pointa[1] - pointb[1]
539 -    meanlat = (pointa[0] + pointb[0])/2.0
540 -
541 -    if not in_radians:
542 -        difflat *= pi/180.0
543 -        difflong *= pi/180.0
544 -        meanlat *= pi/180.0
545 -
546 -    coef = 1. if units == 'm' else 0.001
547 -    return coef*planet_radius*sqrt(difflat**2 + (cos(meanlat)*difflong)**2)
548 -
549 -
550 -###############################################################################
551 -### BASE PROCESSING ############################################################
552 -###############################################################################
553 -class BaseProcessing(object):
554 -    """ A processing object used to provide an abstraction over the different
555 -    distance functions, and help build Nazca processes. """
556 -
557 -    def __init__(self, ref_attr_index=None, target_attr_index=None,
558 -                 distance_callback=euclidean, weight=1, matrix_normalized=False):
559 -        """ Initiate the BaseProcessing
560 -
561 -        Parameters
562 -        ----------
563 -
564 -        ref_attr_index: index of the attribute of interest in a record
565 -                        for the reference dataset
566 -                        (i.e. attribute to be used for key computation)
567 -
568 -        target_attr_index: index of the attribute of interest in a record
569 -                           for the target dataset
570 -                           (i.e. attribute to be used for key computation)
571 -
572 -        distance_callback: distance callback. Default is euclidean distance.
573 -
574 -        weight: weight of the processing in a global distance matrix
575 -
576 -        matrix_normalized: Boolean. If matrix_normalized is True,
577 -                           the distance between two points is changed to
578 -                           a value between 0 (equal) and 1 (totally different).
579 -                           To avoid useless computation and scale
580 -                           problems the following “normalization” is done:
581 -                                d = 1 - 1/(1 + d(x, y))
582 -
583 -        """
584 -        self.ref_attr_index = ref_attr_index
585 -        self.target_attr_index = target_attr_index
586 -        self.distance_callback = distance_callback
587 -        self.weight = weight
588 -        self.matrix_normalized = matrix_normalized
589 -
590 -    def distance(self, reference_record, target_record):
591 -        """ Compute the distance between two records
592 -
593 -        Parameters
594 -        ----------
595 -        reference_record: a record (tuple/list of values) of the reference dataset.
596 -
597 -        target_record: a record (tuple/list of values) of the target dataset.
598 -
599 -        """
600 -        refrecord = (reference_record[self.ref_attr_index] if self.ref_attr_index
601 -                     else reference_record)
602 -        targetrecord = (target_record[self.target_attr_index] if self.target_attr_index
603 -                        else target_record)
604 -        return self.distance_callback(refrecord, targetrecord)
605 -
606 -    def cdist(self, refset, targetset, ref_indexes=None, target_indexes=None):
607 -        """ Compute the metric matrix, given two datasets and a metric
608 -
609 -        Parameters
610 -        ----------
611 -        refset: a dataset (list of records)
612 -
613 -        targetset: a dataset (list of records)
614 -
615 -        Returns
616 -        -------
617 -
618 -        A distance matrix, of shape (len(refset), len(targetset))
619 -        with the distance of each element in it.
620 -        """
621 -        return cdist(self.distance, refset, targetset,
622 -                     matrix_normalized=self.matrix_normalized,
623 -                     ref_indexes=ref_indexes, target_indexes=target_indexes)
624 -
625 -    def pdist(self, dataset):
626 -        """ Compute the upper triangular matrix in a way similar
627 -        to scipy.spatial.distance.pdist
628 -
629 -        Parameters
630 -        ----------
631 -        dataset: a dataset (list of records)
632 -
633 -        Returns
634 -        -------
635 -
636 -        The values of the upper triangular distance matrix
637 -        (of shape (len(dataset), len(dataset))) with the distance of each element in it.
638 -        The values are ordered as row 1, row 2, ...
639 -        """
640 -        values = []
641 -        for i in xrange(len(dataset)):
642 -            for j in xrange(i+1, len(dataset)):
643 -                d = 1
644 -                if dataset[i] and dataset[j]:
645 -                    d = self.distance(dataset[i], dataset[j])
646 -                    if self.matrix_normalized:
647 -                        d = 1 - (1.0/(1.0 + d))
648 -                values.append(d)
649 -        return values
650 -
651 -
652 -###############################################################################
653 -### CONCRETE PROCESSINGS #######################################################
654 -###############################################################################
655 -class LevenshteinProcessing(BaseProcessing):
656 -    """ A processing based on the levenshtein distance.
657 -    """
658 -
659 -    def __init__(self, ref_attr_index=None, target_attr_index=None,
660 -                 tokenizer=None, weight=1, matrix_normalized=False):
661 -        distance_callback = partial(levenshtein,
662 -                                    tokenizer=tokenizer)
663 -        super(LevenshteinProcessing, self).__init__(ref_attr_index,
664 -                                                   target_attr_index,
665 -                                                   distance_callback,
666 -                                                   weight,matrix_normalized)
667 -
668 -
669 -class GeographicalProcessing(BaseProcessing):
670 -    """ A processing based on the geographical distance.
671 -    """
672 -
673 -    def __init__(self, ref_attr_index=None, target_attr_index=None,
674 -                 in_radians=False, planet_radius=6371009, units='m', weight=1, matrix_normalized=False):
675 -        distance_callback = partial(geographical, in_radians=in_radians,
676 -                                    planet_radius=planet_radius, units=units)
677 -        super(GeographicalProcessing, self).__init__(ref_attr_index,
678 -                                                    target_attr_index,
679 -                                                    distance_callback,
680 -                                                    weight,matrix_normalized)
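For context, a hedged sketch (not part of the changeset) of the distance helpers removed above, in Python 2 and against the pre-move import path (nazca.distances). The names and coordinates are illustrative and the values in comments are approximate.

from nazca.distances import (levenshtein, soundex, geographical,
                             LevenshteinProcessing)

print levenshtein('victor', 'victir')   # 1 (one substitution)
print soundex('robert', 'rupert')       # 0 (same soundex code)
print geographical((48.8566, 2.3522),   # Paris -> Lyon, roughly 390 km
                   (45.7640, 4.8357), units='km')

# Attribute 1 of each record is compared with the Levenshtein distance
refset = [(1, 'Victor Hugo'), (2, 'Emile Zola')]
targetset = [(11, 'Victor Hugo'), (12, 'Jules Verne')]
processing = LevenshteinProcessing(ref_attr_index=1, target_attr_index=1)
print processing.cdist(refset, targetset)   # 2x2 distance matrix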
diff --git a/minhashing.py b/minhashing.py
@@ -1,184 +0,0 @@
681 -# -*- coding:utf-8 -*-
682 -# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
683 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
684 -#
685 -# This program is free software: you can redistribute it and/or modify it under
686 -# the terms of the GNU Lesser General Public License as published by the Free
687 -# Software Foundation, either version 2.1 of the License, or (at your option)
688 -# any later version.
689 -#
690 -# This program is distributed in the hope that it will be useful, but WITHOUT
691 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
692 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
693 -# details.
694 -#
695 -# You should have received a copy of the GNU Lesser General Public License along
696 -# with this program. If not, see <http://www.gnu.org/licenses/>.
697 -
698 -import cPickle
699 -
700 -from random import randint
701 -from collections import defaultdict
702 -
703 -import numpy as np
704 -from scipy.optimize import bisect
705 -
706 -from nazca.normalize import iter_wordgrams
707 -
708 -
709 -def randomhashfunction(zr):
710 -    """ Return a random hash function, mapping x in Z to Z/zr:
711 -        h: x -> (a*x + b) mod zr
712 -
713 -    """
714 -    bound = max(zr - 1, 1)
715 -    a = randint(1, bound)
716 -    b = randint(1, bound)
717 -
718 -    def hashfunc(x):
719 -        return ((a*x + b)%zr)
720 -
721 -    return hashfunc
722 -
723 -
724 -class Minlsh(object):
725 -    """ Operate minhashing + locality-sensitive hashing to find similar sentences
726 -    """
727 -
728 -    def __init__(self, verbose=False):
729 -        self._trained = False
730 -        self.sigmatrix = None
731 -        self._verbose = verbose
732 -
733 -    def train(self, sentences, k=2, siglen=200):
734 -        """ Train the minlsh on the given sentences.
735 -
736 -            - `k` is the length of the k-wordgrams used
737 -              (the lower k is, the faster is the training)
738 -            - `siglen` is the length of the sentence signatures
739 -
740 -        """
741 -
742 -        rows, shape = self._buildmatrixdocument(sentences, k)
743 -
744 -        if self._verbose: print "Training is done. Wait while signaturing"
745 -
746 -        self._computesignaturematrix(rows, shape, siglen)
747 -        self._trained = True
748 -
749 -
750 -    def _buildmatrixdocument(self, sentences, k):
751 -        """ Return a sparse matrix where :
752 -
753 -            - Each sentence is a column
754 -            - Each row is an element of the universal set
755 -
756 -            Each value (r, c) is set to 1 if the element at row r is in the
757 -            sentence c, 0 otherwise
758 -
759 -        """
760 -
761 -        rows, universe, sizeofuniverse = [], {}, 0
762 -        for nb, sent in enumerate(sentences):
763 -            row = []
764 -            for w in iter_wordgrams(sent, k):
765 -                row.append(universe.setdefault(w, sizeofuniverse))
766 -                if row[-1] == sizeofuniverse:
767 -                    sizeofuniverse += 1
768 -            rows.append(row)
769 -            if self._verbose and nb % 50000 == 0:
770 -                print nb
771 -
772 -        return rows, (len(rows), sizeofuniverse)
773 -
774 -    def _computesignaturematrix(self, rows, shape, siglen):
775 -        """ Return a matrix where each column is the signature of a document.
776 -            The signature is composed of `siglen` numbers
777 -
778 -            The more rows the documents have in common, the closer they are.
779 -        """
780 -
781 -        nrows, ncols = shape
782 -        sig = np.empty((siglen, nrows))
783 -        #Generate the random hash functions
784 -        hashfunc = [randomhashfunction(ncols) for _ in xrange(siglen)]
785 -        #Compute hashing values just for once.
786 -        #Avoid multiple recomputations for the same column.
787 -        hashvalues = np.array([[hashfunc[i](r) for r in xrange(ncols)]
788 -                                for i in  xrange(siglen)])
789 -
790 -        docind = 0
791 -        while rows:
792 -            doc = rows.pop(0)
793 -            #Concatenate the needed rows.
794 -            tmp = np.dstack([hashvalues[:, r] for r in doc])
795 -            # Take the minimum of hashes
796 -            sig[:, docind] = np.min(tmp[0], 1)
797 -            docind += 1
798 -            if self._verbose and docind % 50000 == 0:
799 -                print (docind * 100) / nrows
800 -        self.sigmatrix = sig
801 -
802 -    def save(self, savefile):
803 -        """ Save the training into `savefile` for a future use """
804 -
805 -        if not self._trained:
806 -            print "Not trained, nothing to save"
807 -            return
808 -
809 -        with open(savefile, 'wb') as fobj:
810 -            pickler = cPickle.Pickler(fobj)
811 -            pickler.dump(self.sigmatrix)
812 -
813 -    def load(self, savefile):
814 -        """ Load a trained minhashing """
815 -
816 -        with open(savefile, 'rb') as fobj:
817 -            pickler = cPickle.Unpickler(fobj)
818 -            self.sigmatrix = pickler.load()
819 -
820 -        if self.sigmatrix is not None:
821 -            self._trained = True
822 -        else:
823 -            self._trained = False
824 -
825 -    def computebandsize(self, threshold, nbrows):
826 -        """ Compute the bandsize according to the threshold given """
827 -
828 -        ### t ~ (1/b)^(1/r), where t is the threshold, b the number of
829 -        ### bands, and r the number of rows per band. The length of the
830 -        ### signature matrix is nbrows = b*r, so t ~ (r/nbrows)^(1/r). So, let's
831 -        ### find the root of f(x) = (x/nbrows)^(1/x) - t.
832 -        def f(x):
833 -            y = pow(x/nbrows, 1. /x) - threshold
834 -            return y
835 -
836 -        ## Solve f(x) = 0, with x having values in [1, nbrows]
837 -        return int(bisect(f, 1, nbrows))
838 -
839 -    def predict(self, threshold):
840 -        """ Return a set of tuples of *possible* similar sentences
841 -        """
842 -        if not self._trained:
843 -            print "Train it before"
844 -            return
845 -
846 -        if not (0 < threshold <= 1):
847 -            print "Threshold must be in ]0 ; 1]"
848 -            return
849 -
850 -        sig = self.sigmatrix
851 -        # Threshold is a percentage of similarity
852 -        # It should be inverted here (0 is close, 1 is far)
853 -        threshold = 1 - threshold
854 -        bandsize = self.computebandsize(threshold, self.sigmatrix.shape[0])
855 -
856 -        buckets = defaultdict(set)
857 -        similars = set()
858 -        for r in xrange(0, sig.shape[0], bandsize):
859 -            buckets.clear()
860 -            for i in xrange(sig.shape[1]):
861 -                buckets[tuple(sig[r:r+bandsize, i])].add(i)
862 -            similars.update(set(tuple(v) for v in buckets.itervalues()
863 -                                         if len(v) > 1))
864 -        return similars
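For context, a hedged usage sketch (not part of the changeset) of the Minlsh class removed above, in Python 2 and against the pre-move import path (nazca.minhashing). The sentences are made-up examples and the 0.8 threshold is arbitrary.

from nazca.minhashing import Minlsh

sentences = [u'le petit chat boit du lait',
             u'le petit chien boit du lait',
             u'un exemple totalement different']
minlsh = Minlsh()
minlsh.train(sentences, k=2, siglen=200)
# Groups of sentence indexes whose signatures collide on at least one band,
# i.e. *candidate* near-duplicates around the requested similarity
print minlsh.predict(threshold=0.8)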
diff --git a/ner/dataio.py b/ner/dataio.py
@@ -1,140 +0,0 @@
865 -# -*- coding: utf-8 -*-
866 -""" IO for Named Entity Recognition.
867 -"""
868 -import json
869 -import urllib
870 -import lxml.etree as ET
871 -
872 -
873 -###############################################################################
874 -### SPARQL UTILITIES ##########################################################
875 -###############################################################################
876 -def sparql_query(query, endpoint):
877 -    """ Execute a query on an endpoint:
878 -
879 -    sparql_query(query=u'''SELECT ?uri ?type
880 -                           WHERE{
881 -                           ?uri rdfs:label "Python"@en .
882 -                           ?uri rdf:type ?type}''',
883 -                           endpoint=u'http://dbpedia.org/sparql')
884 -    """
885 -    from SPARQLWrapper import SPARQLWrapper, JSON
886 -    sparql = SPARQLWrapper(endpoint)
887 -    sparql.setQuery(query)
888 -    sparql.setReturnFormat(JSON)
889 -    try:
890 -        rawresults = sparql.query().convert()
891 -        labels = rawresults['head']['vars']
892 -        return rawresults["results"]["bindings"]
893 -    except:
894 -        print 'Error in sparql query'
895 -        return []
896 -
897 -
898 -###############################################################################
899 -### RQL UTILITIES #############################################################
900 -###############################################################################
901 -def get_cw_cnx(endpoint):
902 -    """ Get a cnx on a CubicWeb database
903 -    """
904 -    from cubicweb import dbapi
905 -    from cubicweb.cwconfig import CubicWebConfiguration
906 -    from cubicweb.entities import AnyEntity
907 -    CubicWebConfiguration.load_cwctl_plugins()
908 -    config = CubicWebConfiguration.config_for(endpoint)
909 -    sourceinfo = config.sources()['admin']
910 -    login = sourceinfo['login']
911 -    password = sourceinfo['password']
912 -    _, cnx = dbapi.in_memory_repo_cnx(config, login, password=password)
913 -    req = cnx.request()
914 -    return req
915 -
916 -def rql_appid_query(query, endpoint, _cache_cnx={}, **kwargs):
917 -    """ Execute a query on an appid endpoint:
918 -
919 -    rql_query('Any X WHERE X label "Python"', 'localhost')
920 -
921 -    Additional arguments can be passed to be properly substituted
922 -    in the execute() function.
923 -    """
924 -    if endpoint in _cache_cnx:
925 -        cnx = _cache_cnx[endpoint]
926 -    else:
927 -        cnx = get_cw_cnx(endpoint)
928 -        _cache_cnx[endpoint] = cnx
929 -    return cnx.execute(query, kwargs)
930 -
931 -def rql_url_query(query, endpoint):
932 -    """ Execute a query on an url endpoint:
933 -
934 -    rql_query('Any X WHERE X label "Python"', 'localhost')
935 -    """
936 -    url = urllib.basejoin(endpoint, '?rql=%s&vid=jsonexport' % query)
937 -    return json.loads(urllib.urlopen(url).read())
938 -
939 -
940 -###############################################################################
941 -### OUTPUT UTILITIES ##########################################################
942 -###############################################################################
943 -class AbstractNerdyPrettyPrint(object):
944 -    """ Pretty print the output of a Nerdy process
945 -    """
946 -
947 -    def pprint_text(self, text, named_entities, **kwargs):
948 -        newtext = u''
949 -        indice = 0
950 -        tindices = dict([(t.start, (uri, t)) for uri, p, t in named_entities])
951 -        while indice < len(text):
952 -            if indice in tindices:
953 -                uri, t = tindices[indice]
954 -                words = text[t.start:t.end]
955 -                fragment = self.pprint_entity(uri, words, **kwargs)
956 -                if not self.is_valid(newtext+fragment+text[t.end:]):
957 -                    fragment = words
958 -                newtext += fragment
959 -                indice = t.end
960 -            else:
961 -                newtext += text[indice]
962 -                indice += 1
963 -        return newtext
964 -
965 -    def pprint_entity(self, uri, word, **kwargs):
966 -        """ Pretty print an entity """
967 -        raise NotImplementedError
968 -
969 -    def is_valid(self, newtext):
970 -        """Override to check the validity of the prettified content at each
971 -        enrichment step"""
972 -        return True
973 -
974 -
975 -class NerdyHTMLPrettyPrint(AbstractNerdyPrettyPrint):
976 -    """ Pretty print the output of a Nerdy process
977 -    """
978 -
979 -    def pprint_entity(self, uri, word, **kwargs):
980 -        """ Pretty print an entity """
981 -        klass = ' class="%s"' % kwargs['html_class'] if 'html_class' in kwargs else ''
982 -        return u'<a href="%s"%s>%s</a>' % (uri, klass, word)
983 -
984 -
985 -class NerdyValidXHTMLPrettyPrint(NerdyHTMLPrettyPrint):
986 -
987 -    XHTML_DOC_TEMPLATE = '''\
988 -<?xml version="1.0" encoding="UTF-8" ?>
989 -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
990 -<html xmlns="http://www.w3.org/1999/xhtml">
991 -<head>
992 -<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
993 -<title>nerdy</title>
994 -</head>
995 -<body><div>%s</div></body>
996 -</html>'''
997 -
998 -    def is_valid(self, html):
999 -        try:
1000 -            ET.fromstring(self.XHTML_DOC_TEMPLATE % html.encode('utf-8'),
1001 -                          parser=ET.XMLParser(dtd_validation=True))
1002 -        except ET.XMLSyntaxError:
1003 -            return False
1004 -        return True
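For context, a hedged sketch (not part of the changeset) of the pretty printers removed above, in Python 2 and against the pre-move import path (nazca.ner.dataio). Token is a hypothetical stand-in for the token objects produced by the ner core; only its start and end attributes are used by pprint_text.

from collections import namedtuple
from nazca.ner.dataio import NerdyHTMLPrettyPrint

Token = namedtuple('Token', ['word', 'start', 'end'])
text = u'Victor Hugo was born in Besancon'
named_entities = [(u'http://dbpedia.org/resource/Victor_Hugo', None,
                   Token(u'Victor Hugo', 0, 11))]
print NerdyHTMLPrettyPrint().pprint_text(text, named_entities)
# <a href="http://dbpedia.org/resource/Victor_Hugo">Victor Hugo</a> was born in Besancon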
diff --git a/normalize.py b/normalize.py
@@ -1,415 +0,0 @@
1005 -# -*- coding:utf-8 -*-
1006 -# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
1007 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
1008 -#
1009 -# This program is free software: you can redistribute it and/or modify it under
1010 -# the terms of the GNU Lesser General Public License as published by the Free
1011 -# Software Foundation, either version 2.1 of the License, or (at your option)
1012 -# any later version.
1013 -#
1014 -# This program is distributed in the hope that it will be useful, but WITHOUT
1015 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1016 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1017 -# details.
1018 -#
1019 -# You should have received a copy of the GNU Lesser General Public License along
1020 -# with this program. If not, see <http://www.gnu.org/licenses/>.
1021 -
1022 -import re
1023 -from string import punctuation
1024 -from warnings import warn
1025 -from unicodedata import normalize as _uninormalize
1026 -from functools import partial
1027 -
1028 -
1029 -FRENCH_STOPWORDS = set([u'alors', u'au', u'aux', u'aucuns', u'aussi', u'autre', u'avant',
1030 -u'avec', u'avoir', u'bon', u'car', u'ce', u'cela', u'ces', u'ceux', u'chaque',
1031 -u'ci', u'comme', u'comment', u'dans', u'de', u'des', u'du', u'dedans', u'dehors',
1032 -u'depuis', u'deux', u'devrait', u'doit', u'donc', u'dos', u'droite', u'début',
1033 -u'elle', u'elles', u'en', u'encore', u'essai', u'est', u'et', u'eu', u'eux', u'fait',
1034 -u'faites', u'fois', u'font', u'force', u'haut', u'hors', u'ici', u'il', u'ils',
1035 -u'je', u'juste', u'la', u'le', u'les', u'leur', u'lui', u'là', u'ma', u'maintenant',
1036 -u'mais', u'me', u'mes', u'moi', u'moins', u'mon', u'mot', u'même', u'ne',
1037 -u'ni', u'nommés', u'nos',
1038 -u'notre', u'nous', u'nouveaux', u'on', u'ou', u'où', u'par', u'parce', u'parole',
1039 -u'pas', u'personnes', u'peut', u'peu', u'pièce', u'plupart', u'pour',
1040 -u'pourquoi', u'quand', u'que', u'quel', u'quelle', u'quelles', u'quels', u'qui',
1041 -u'sa', u'sans', u'se', u'ses', u'seulement', u'si', u'sien', u'son', u'sont', u'sous',
1042 -u'soyez', u'sujet', u'sur', u'ta', u'tandis', u'tellement', u'te', u'tels', u'tes', u'toi',
1043 -u'ton', u'tous', u'tout', u'trop', u'très', u'tu', u'un', u'une', u'valeur', u'voie',
1044 -u'voient', u'vont', u'vos', u'votre', u'vous', u'vu', u'ça', u'étaient', u'état',
1045 -u'étions', u'été', u'être'])
1046 -
1047 -MANUAL_UNICODE_MAP = {
1048 -    u'\xa1': u'!',    # INVERTED EXCLAMATION MARK
1049 -    u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE
1050 -    u'\u2044': u'/',  # FRACTION SLASH
1051 -    u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE
1052 -    u'\xa9': u'(c)',  # COPYRIGHT SIGN
1053 -    u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
1054 -    u'\xe6': u'ae',   # LATIN SMALL LETTER AE
1055 -    u'\xae': u'(r)',  # REGISTERED SIGN
1056 -    u'\u0153': u'oe', # LATIN SMALL LIGATURE OE
1057 -    u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE
1058 -    u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE
1059 -    u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE
1060 -    u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
1061 -    u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S
1062 -    }
1063 -
1064 -
1065 -###############################################################################
1066 -### NORMALIZE FUNCTIONS #######################################################
1067 -###############################################################################
1068 -def unormalize(ustring, substitute=None):
1069 -    """replace diacritical characters with their corresponding ascii characters
1070 -
1071 -    Convert the unicode string to its long normalized form (unicode character
1072 -    will be transformed into several characters) and keep the first one only.
1073 -    The normal form KD (NFKD) will apply the compatibility decomposition, i.e.
1074 -    replace all compatibility characters with their equivalents.
1075 -
1076 -    :type substitute: str
1077 -    :param substitute: replacement character to use if decomposition fails
1078 -
1079 -    :see: Another project about ASCII transliterations of Unicode text
1080 -          http://pypi.python.org/pypi/Unidecode
1081 -    """
1082 -    res = []
1083 -    for letter in ustring[:]:
1084 -        try:
1085 -            replacement = MANUAL_UNICODE_MAP[letter]
1086 -        except KeyError:
1087 -            if isinstance(letter, unicode):
1088 -                replacement = _uninormalize('NFKD', letter)[0]
1089 -            else:
1090 -                replacement = letter
1091 -            if ord(replacement) >= 2 ** 7:
1092 -                if substitute is None:
1093 -                    raise ValueError("can't deal with non-ascii based characters")
1094 -                replacement = substitute
1095 -        res.append(replacement)
1096 -    return u''.join(res)
1097 -
1098 -def lunormalize(sentence, substitute=None):
1099 -    """ Normalize a sentence (i.e. remove accents, set to lowercase, etc.) """
1100 -    return unormalize(sentence,substitute).lower()
1101 -
1102 -def simplify(sentence, lemmas=None, remove_stopwords=True, stopwords=FRENCH_STOPWORDS):
1103 -    """ Simplify the given sentence
1104 -        0) If remove_stopwords, then remove the stop words
1105 -        1) If lemmas are given, the sentence is lemmatized
1106 -        2) Set the sentence to lower case
1107 -        3) Remove punctuation
1108 -    """
1109 -    if not isinstance(sentence, basestring):
1110 -        return sentence
1111 -
1112 -    if lemmas:
1113 -        sentence = lemmatized(sentence, lemmas)
1114 -    sentence = sentence.lower()
1115 -    cleansent = ''.join([s if s not in punctuation
1116 -                           else ' ' for s in sentence]).strip()
1117 -    # a punctuation char followed by a space becomes two spaces, keep only one
1118 -    cleansent = cleansent.replace('  ', ' ')
1119 -
1120 -    if not remove_stopwords:
1121 -        return cleansent
1122 -    else:
1123 -        return ' '.join([w for w in cleansent.split(' ') if w not in stopwords])
1124 -
1125 -def tokenize(sentence, tokenizer=None, regexp=re.compile(r"[^\s]+")):
1126 -    """ Tokenize a sentence.
1127 -        Use ``tokenizer`` if given, else try to use the nltk WordPunctTokenizer,
1128 -        in case of failure, it just splits on spaces.
1129 -
1130 -        In any case, the tokenizer must have a ``tokenize()`` method
1131 -    """
1132 -    if tokenizer:
1133 -        return tokenizer().tokenize(sentence)
1134 -    # XXX Unicode, could not use WorkTokenizer.
1135 -    # Instead split on whitespaces
1136 -    chunks = []
1137 -    for chunk in [t for t in regexp.findall(sentence) if t]:
1138 -        # Deals with '
1139 -        if "'" in chunk:
1140 -            schunks = chunk.split("'")
1141 -            chunks.extend([c+"'" for c in schunks[:-1]])
1142 -            chunks.append(schunks[-1])
1143 -        else:
1144 -            chunks.append(chunk)
1145 -    return chunks
1146 -
1147 -def iter_wordgrams(sentence, k):
1148 -    """ Generator of k-wordgrams on the given sentence
1149 -    """
1150 -    words = sentence.split(' ')
1151 -    #XXX Call tokenizer
1152 -    for r in xrange(len(words)):
1153 -        yield ' '.join(words[r:r + k])
1154 -
1155 -def loadlemmas(filename, encoding='utf-8'):
1156 -    """ Return the default lemmas dictionary
1157 -    """
1158 -    lemmas = {}
1159 -    with open(filename) as fobj:
1160 -        for line in fobj:
1161 -            line = line.decode(encoding).strip().split('\t')
1162 -            if len(line) == 2:
1163 -                lemmas[line[0]] = line[1]
1164 -    return lemmas
1165 -
1166 -def lemmatized(sentence, lemmas, tokenizer=None):
1167 -    """ Return the lemmatized sentence
1168 -    """
1169 -    tokenized_sent = tokenize(sentence, tokenizer)
1170 -    tokenized_sentformated = []
1171 -    for w in tokenized_sent:
1172 -        if w in ".,'" and len(tokenized_sentformated) > 0:
1173 -            tokenized_sentformated[-1] += w
1174 -        elif w not in punctuation:
1175 -            tokenized_sentformated.append(w)
1176 -    return u' '.join([lemmatized_word(w, lemmas) for w in tokenized_sentformated])
1177 -
1178 -def lemmatized_word(word, lemmas):
1179 -    """ Return the lemmatized word
1180 -    """
1181 -    lemma = lemmas.get(word.lower(), word)
1182 -    if '|' in lemma:
1183 -        _words = lemma.split('|')
1184 -        if word.lower() in _words:
1185 -            lemma = word.lower()
1186 -        else:
1187 -            lemma = _words[0]
1188 -    return lemma
1189 -
1190 -def roundstr(number, ndigits=0):
1191 -    """Return a unicode string of ``number`` rounded to a given precision
1192 -        in decimal digits (default 0 digits)
1193 -
1194 -        If ``number`` is not a float, this method casts it to a float. (An
1195 -        exception may be raised if it's not possible)
1196 -    """
1197 -    return format(round(float(number), ndigits), '0.%df' % ndigits)
1198 -
1199 -def rgxformat(string, regexp, output):
1200 -    """ Apply the regexp to the ``string`` and return a formatted string
1201 -    according to ``output``
1202 -
1203 -    eg :
1204 -        format(u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]',
1205 -               r'\[(?P<firstname>\w+) (?P<lastname>\w+) - '
1206 -               r'(?P<birthdate>.*?) / (?P<deathdate>.*?)\]',
1207 -               u'%(lastname)s, %(firstname)s (%(birthdate)s -'
1208 -               u'%(deathdate)s)')
1209 -
1210 -     would return u'Hugo, Victor (26 fev 1802 - 22 mai 1885)'
1211 -     """
1212 -
1213 -    match = re.match(regexp, string)
1214 -    return output % match.groupdict()
1215 -
1216 -
1217 -###############################################################################
1218 -### NORMALIZER OBJECTS ########################################################
1219 -###############################################################################
1220 -class BaseNormalizer(object):
1221 -    """ A normalizer object used to provide an abstraction over the different
1222 -    normalization functions, and help build Nazca processes. """
1223 -
1224 -    def __init__(self, callback, attr_index=None):
1225 -        """ Initiate the BaseNormalizer
1226 -
1227 -        Parameters
1228 -        ----------
1229 -        callback: normalization callback
1230 -
1231 -        attr_index: index of the attribute of interest in a record
1232 -                    (i.e. attribute to be normalized).
1233 -                    By default, 'attr_index' is None and the whole
1234 -                    record is passed to the callback.
1235 -                    If given, only the attr_index value of the record
1236 -                    is passed to the callback.
1237 -                    Could be a list or an int
1238 -        """
1239 -        self.callback = callback
1240 -        if attr_index is not None:  # 'is not None' so that attr_index 0 is not dropped
1241 -            self.attr_index = attr_index if isinstance(attr_index, (tuple, list)) else (attr_index,)
1242 -        else:
1243 -            self.attr_index = attr_index
1244 -
1245 -    def normalize(self, record):
1246 -        """ Normalize a record
1247 -
1248 -        Parameters
1249 -        ----------
1250 -        record: a record (tuple/list of values).
1251 -
1252 -        Returns
1253 -        -------
1254 -
1255 -        record: the normalized record.
1256 -        """
1257 -        if self.attr_index is None:
1258 -            return self.callback(record)
1259 -        else:
1260 -            for attr_ind in self.attr_index:
1261 -                record = list(r if ind != attr_ind else self.callback(r)
1262 -                               for ind, r in enumerate(record))
1263 -            return record
1264 -
1265 -    def normalize_dataset(self, dataset, inplace=False):
1266 -        """ Normalize a dataset
1267 -
1268 -        Parameters
1269 -        ----------
1270 -        dataset: a list of record (tuple/list of values).
1271 -
1272 -        inplace: Boolean. If True, normalize the dataset in place.
1273 -
1274 -        Returns
1275 -        -------
1276 -
1277 -        record: the normalized dataset.
1278 -        """
1279 -        if not inplace:
1280 -            dataset = [self.normalize(record) for record in dataset]
1281 -        else:
1282 -            # Change dataset in place
1283 -            for ind, record in enumerate(dataset):
1284 -                dataset[ind] = self.normalize(record)
1285 -        return dataset
1286 -
1287 -
1288 -class UnicodeNormalizer(BaseNormalizer):
1289 -    """ Normalizer that unormalize the unicode
1290 -    (i.e. replace accented characters by ASCII ones)
1291 -    """
1292 -    def __init__(self, attr_index=None, substitute=None):
1293 -        callback = partial(lunormalize, substitute=substitute)
1294 -        super(UnicodeNormalizer, self).__init__(callback, attr_index=attr_index)
1295 -
1296 -
1297 -class SimplifyNormalizer(BaseNormalizer):
1298 -    """ Normalizer that simplify a string
1299 -        0) If remove_stopwords, then remove the stop words
1300 -        1) If lemmas are given, the sentence is lemmatized
1301 -        2) Set the sentence to lower case
1302 -        3) Remove punctuation
1303 -    """
1304 -    def __init__(self, attr_index=None, lemmas=None, remove_stopwords=True):
1305 -        callback = partial(simplify, lemmas=lemmas, remove_stopwords=remove_stopwords)
1306 -        super(SimplifyNormalizer, self).__init__(callback, attr_index=attr_index)
1307 -
1308 -
1309 -class TokenizerNormalizer(BaseNormalizer):
1310 -    """ Normalizer that tokenize a string
1311 -        Use ``tokenizer`` if given, else try to use the nltk WordPunctTokenizer;
1312 -        in case of failure, it just splits on spaces.
1313 -        Anyway, tokenizer must have a ``tokenize()`` method
1314 -    """
1315 -    def __init__(self, attr_index=None, tokenizer=None, regexp=re.compile(r"[^\s]+")):
1316 -        callback = partial(tokenize, tokenizer=tokenizer, regexp=regexp)
1317 -        super(TokenizerNormalizer, self).__init__(callback, attr_index=attr_index)
1318 -
1319 -
1320 -class LemmatizerNormalizer(BaseNormalizer):
1321 -    """ Normalizer that lemmatize a string
1322 -    """
1323 -    def __init__(self, lemmas, attr_index=None, tokenizer=None):
1324 -        callback = partial(lemmatized, lemmas=lemmas, tokenizer=tokenizer)
1325 -        super(LemmatizerNormalizer, self).__init__(callback, attr_index=attr_index)
1326 -
1327 -
1328 -class RoundNormalizer(BaseNormalizer):
1329 -    """Normalizer that round a string
1330 -    Return an unicode string of ``number`` rounded to a given precision
1331 -    in decimal digits (default 0 digits)
1332 -
1333 -    If ``number`` is not a float, this method casts it to a float. (An
1334 -    exception may be raised if it's not possible)
1335 -    """
1336 -    def __init__(self, attr_index=None, ndigits=0):
1337 -        callback = partial(roundstr, ndigits=ndigits)
1338 -        super(RoundNormalizer, self).__init__(callback, attr_index=attr_index)
1339 -
1340 -
1341 -class RegexpNormalizer(BaseNormalizer):
1342 -    """Normalizer that normalize a string based on a regexp
1343 -
1344 -     Apply the regexp to the ``string`` and return a formatted string
1345 -    according to ``output``
1346 -
1347 -    eg :
1348 -        format(u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]',
1349 -               r'\[(?P<firstname>\w+) (?P<lastname>\w+) - '
1350 -               r'(?P<birthdate>.*?) / (?P<deathdate>.*?)\]',
1351 -               u'%(lastname)s, %(firstname)s (%(birthdate)s -'
1352 -               u'%(deathdate)s)')
1353 -
1354 -     would return u'Hugo, Victor (26 fev 1802 - 22 mai 1885)'
1355 -    """
1356 -    def __init__(self, regexp, output, attr_index=None):
1357 -        callback = partial(rgxformat, regexp=regexp, output=output)
1358 -        super(RegexpNormalizer, self).__init__(callback, attr_index=attr_index)
1359 -
1360 -
1361 -###############################################################################
1362 -### JOIN NORMALIZER ###########################################################
1363 -###############################################################################
1364 -class JoinNormalizer(BaseNormalizer):
1365 -    """Normalizer that join multiple fields in only one.
1366 -    This new field will be put at the end of the new record.
1367 -    """
1368 -    def __init__(self, attr_indexes, join_car=', '):
1369 -        self.attr_indexes = attr_indexes
1370 -        self.join_car = join_car
1371 -
1372 -    def normalize(self, record):
1373 -        """ Normalize a record
1374 -
1375 -        Parameters
1376 -        ----------
1377 -        record: a record (tuple/list of values).
1378 -
1379 -        Returns
1380 -        -------
1381 -
1382 -        record: the normalized record.
1383 -        """
1384 -        _record = [r for ind, r in enumerate(record) if ind not in self.attr_indexes]
1385 -        _record.append(self.join_car.join([r for ind, r in enumerate(record) if ind in self.attr_indexes]))
1386 -        return _record
1387 -
1388 -
1389 -###############################################################################
1390 -### NORMALIZER PIPELINE #######################################################
1391 -###############################################################################
1392 -class NormalizerPipeline(BaseNormalizer):
1393 -    """ Pipeline of Normalizers
1394 -    """
1395 -
1396 -    def __init__(self, normalizers):
1397 -        """ Initiate the NormalizerPipeline
1398 -
1399 -        Parameters
1400 -        ----------
1401 -        normalizers: list (ordered) of Normalizer
1402 -        """
1403 -        self.normalizers = normalizers
1404 -
1405 -    def normalize(self, record):
1406 -        """ Normalize a record
1407 -
1408 -        Parameters
1409 -        ----------
1410 -        record: a record (tuple/list of values).
1411 -
1412 -        Returns
1413 -        -------
1414 -
1415 -        record: the normalized record.
1416 -        """
1417 -        for normalizer in self.normalizers:
1418 -            record = normalizer.normalize(record)
1419 -        return record
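Note: the normalizer classes removed above are recreated verbatim in utils/normalize.py by this changeset; only the import path changes. A rough usage sketch of the new path (the record and the attribute indexes below are invented for illustration):

    # Illustrative sketch only -- record layout and indexes are made up.
    from nazca.utils.normalize import (UnicodeNormalizer, SimplifyNormalizer,
                                       NormalizerPipeline)

    record = (u'V123', u'Victor Hugo', u'Besancon, Franche-Comte')
    # Normalize only the two textual attributes (indexes 1 and 2)
    pipeline = NormalizerPipeline([UnicodeNormalizer(attr_index=[1, 2]),
                                   SimplifyNormalizer(attr_index=[1, 2])])
    print pipeline.normalize(record)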
diff --git a/record_linkage/aligner.py b/record_linkage/aligner.py
@@ -19,11 +19,11 @@
1420  from collections import defaultdict
1421 
1422  from scipy import zeros
1423  from scipy.sparse import lil_matrix
1424 
1425 -from nazca.dataio import parsefile
1426 +from nazca.utils.dataio import parsefile
1427 
1428 
1429  ###############################################################################
1430  ### UTILITY FUNCTIONS #########################################################
1431  ###############################################################################
diff --git a/record_linkage/blocking.py b/record_linkage/blocking.py
@@ -30,12 +30,12 @@
1432  from functools import partial
1433  import warnings
1434 
1435  from scipy.spatial import KDTree
1436 
1437 -from nazca.minhashing import Minlsh
1438 -from nazca.distances import soundexcode
1439 +from nazca.utils.minhashing import Minlsh
1440 +from nazca.utils.distances import soundexcode
1441 
1442 
1443  ###############################################################################
1444  ### GENERAL BLOCKING ##########################################################
1445  ###############################################################################
diff --git a/test/test_alignment.py b/test/test_alignment.py
@@ -19,14 +19,14 @@
1446  import unittest2
1447  import random
1448  random.seed(6) ### Make sure tests are repeatable
1449  from os import path
1450 
1451 -from nazca.normalize import simplify
1452 +from nazca.utils.normalize import simplify
1453  import nazca.record_linkage.aligner as alig
1454  import nazca.record_linkage.blocking as blo
1455 -from nazca.distances import LevenshteinProcessing, GeographicalProcessing
1456 +from nazca.utils.distances import LevenshteinProcessing, GeographicalProcessing
1457 
1458 
1459  TESTDIR = path.dirname(__file__)
1460 
1461 
diff --git a/test/test_blocking.py b/test/test_blocking.py
@@ -19,18 +19,18 @@
1462  from os import path
1463  from functools import partial
1464  import random
1465  random.seed(6) ### Make sure tests are repeatable / Minhashing
1466 
1467 -from nazca.distances import (levenshtein, soundex, soundexcode,   \
1468 -                             jaccard, euclidean, geographical)
1469 +from nazca.utils.distances import (levenshtein, soundex, soundexcode,
1470 +                                   jaccard, euclidean, geographical)
1471  from nazca.record_linkage.blocking import (KeyBlocking, SortedNeighborhoodBlocking,
1472                                             MergeBlocking,
1473                                             NGramBlocking, PipelineBlocking,
1474                                             SoundexBlocking, KmeansBlocking,
1475                                             MinHashingBlocking, KdTreeBlocking)
1476 -from nazca.normalize import SimplifyNormalizer, loadlemmas
1477 +from nazca.utils.normalize import SimplifyNormalizer, loadlemmas
1478 
1479 
1480  TESTDIR = path.dirname(__file__)
1481 
1482  SOUNDEX_REFSET = (('a1', 'smith'),
diff --git a/test/test_dataio.py b/test/test_dataio.py
@@ -20,11 +20,11 @@
1483  import shutil
1484  from contextlib import contextmanager
1485  from os import path
1486  from tempfile import mkdtemp
1487 
1488 -from nazca.dataio import sparqlquery, parsefile, autocast, split_file
1489 +from nazca.utils.dataio import sparqlquery, parsefile, autocast, split_file
1490 
1491 
1492  TESTDIR = path.dirname(__file__)
1493 
1494  @contextmanager
diff --git a/test/test_distances.py b/test/test_distances.py
@@ -19,13 +19,13 @@
1495  import unittest2
1496  import random
1497  random.seed(6) ### Make sure tests are repeatable
1498  from dateutil import parser as dateparser
1499 
1500 -from nazca.distances import (levenshtein, soundex, soundexcode,\
1501 -                             jaccard, euclidean, geographical,
1502 -                             LevenshteinProcessing)
1503 +from nazca.utils.distances import (levenshtein, soundex, soundexcode,
1504 +                                   jaccard, euclidean, geographical,
1505 +                                   LevenshteinProcessing)
1506 
1507 
1508  class DistancesTest(unittest2.TestCase):
1509      def test_levenshtein(self):
1510          self.assertEqual(levenshtein('niche', 'chiens'), 5)
diff --git a/test/test_minhashing.py b/test/test_minhashing.py
@@ -19,12 +19,12 @@
1511  import unittest2
1512  from os import path
1513  import random
1514  random.seed(6) ### Make sure tests are repeatable
1515 
1516 -from nazca.normalize import loadlemmas, simplify
1517 -from nazca.minhashing import Minlsh
1518 +from nazca.utils.normalize import loadlemmas, simplify
1519 +from nazca.utils.minhashing import Minlsh
1520 
1521  TESTDIR = path.dirname(__file__)
1522 
1523 
1524 
diff --git a/test/test_normalize.py b/test/test_normalize.py
@@ -17,16 +17,16 @@
1525  # with this program. If not, see <http://www.gnu.org/licenses/>.
1526 
1527  import unittest2
1528  from os import path
1529 
1530 -from nazca.normalize import (BaseNormalizer, UnicodeNormalizer, JoinNormalizer,
1531 -                             SimplifyNormalizer, TokenizerNormalizer,
1532 -                             LemmatizerNormalizer, RoundNormalizer,
1533 -                             RegexpNormalizer, NormalizerPipeline,
1534 -                             lunormalize, loadlemmas, lemmatized, \
1535 -                             roundstr, rgxformat, tokenize, simplify)
1536 +from nazca.utils.normalize import (BaseNormalizer, UnicodeNormalizer, JoinNormalizer,
1537 +                                   SimplifyNormalizer, TokenizerNormalizer,
1538 +                                   LemmatizerNormalizer, RoundNormalizer,
1539 +                                   RegexpNormalizer, NormalizerPipeline,
1540 +                                   lunormalize, loadlemmas, lemmatized,
1541 +                                   roundstr, rgxformat, tokenize, simplify)
1542 
1543 
1544  TESTDIR = path.dirname(__file__)
1545 
1546 
diff --git a/utils/__init__.py b/utils/__init__.py
diff --git a/utils/dataio.py b/utils/dataio.py
@@ -0,0 +1,224 @@
1547 +# -*- coding:utf-8 -*-
1548 +# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
1549 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
1550 +#
1551 +# This program is free software: you can redistribute it and/or modify it under
1552 +# the terms of the GNU Lesser General Public License as published by the Free
1553 +# Software Foundation, either version 2.1 of the License, or (at your option)
1554 +# any later version.
1555 +#
1556 +# This program is distributed in the hope that it will be useful, but WITHOUT
1557 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1558 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1559 +# details.
1560 +#
1561 +# You should have received a copy of the GNU Lesser General Public License along
1562 +# with this program. If not, see <http://www.gnu.org/licenses/>.
1563 +
1564 +from os.path import exists as fileexists
1565 +from os import path as osp
1566 +
1567 +import csv
1568 +import urllib
1569 +
1570 +try:
1571 +    from SPARQLWrapper import SPARQLWrapper, JSON
1572 +    SPARQL_ENABLED = True
1573 +except ImportError:
1574 +    SPARQL_ENABLED = False
1575 +
1576 +
1577 +###############################################################################
1578 +### UTILITY FUNCTIONS #########################################################
1579 +###############################################################################
1580 +def autocast(data, encoding=None):
1581 +    """ Try to convert data into a specific type
1582 +    in (int, float, str)
1583 +    """
1584 +    try:
1585 +        return int(data)
1586 +    except ValueError:
1587 +        try:
1588 +            return float(data.replace(',', '.'))
1589 +        except ValueError:
1590 +            data = data.strip()
1591 +            if encoding:
1592 +                return data.decode(encoding)
1593 +            return data
1594 +
1595 +
1596 +###############################################################################
1597 +### RQL FUNCTIONS #############################################################
1598 +###############################################################################
1599 +def rqlquery(host, rql, indexes=None, formatopt=None):
1600 +    """ Run the rql query on the given cubicweb host
1601 +    """
1602 +
1603 +    if host.endswith('/'):
1604 +        host = host[:-1]
1605 +
1606 +    indexes = indexes or []
1607 +    filehandle = urllib.urlopen('%(host)s/view?'
1608 +                                'rql=%(rql)s&vid=csvexport'
1609 +                                % {'rql': rql, 'host': host})
1610 +    filehandle.readline()  # Skip the first line
1611 +    return parsefile(filehandle, delimiter=';', indexes=indexes,
1612 +                     formatopt=formatopt)
1613 +
1614 +
1615 +###############################################################################
1616 +### SPARQL FUNCTIONS ##########################################################
1617 +###############################################################################
1618 +def sparqlquery(endpoint, query, indexes=None, autocaste_data=True):
1619 +    """ Run the sparql query on the given endpoint, and wrap the items in the
1620 +    indexes form. If indexes is empty, keep raw output"""
1621 +
1622 +    if not SPARQL_ENABLED:
1623 +        raise ImportError("You have to install the SPARQLWrapper and JSON modules to "
1624 +                          "use this function")
1625 +
1626 +    sparql = SPARQLWrapper(endpoint)
1627 +    sparql.setQuery(query)
1628 +    sparql.setReturnFormat(JSON)
1629 +    rawresults = sparql.query().convert()
1630 +    labels = rawresults['head']['vars']
1631 +    results = []
1632 +    indexes = indexes or []
1633 +    if autocaste_data:
1634 +        transform = autocast
1635 +    else:
1636 +        def transform(arg): return arg
1637 +    for raw in rawresults["results"]["bindings"]:
1638 +        data = []
1639 +        if not indexes:
1640 +            data = [transform(raw[label]['value']) for label in labels]
1641 +        else:
1642 +            for il, ind in enumerate(indexes):
1643 +                if isinstance(ind, tuple):
1644 +                    data.append(tuple([transform(raw[labels[i]]['value']) for i in ind]))
1645 +                else:
1646 +                    data.append(transform(raw[labels[il]]['value']))
1647 +        results.append(data)
1648 +    return results
1649 +
1650 +
1651 +###############################################################################
1652 +### FILE FUNCTIONS ############################################################
1653 +###############################################################################
1654 +def parsefile(filename, indexes=None, nbmax=None, delimiter='\t',
1655 +              encoding='utf-8', field_size_limit=None, formatopt=None):
1656 +    """ Parse the file (read at most ``nbmax`` lines if given). Each
1657 +        line is split according to ``delimiter`` and only ``indexes`` are kept
1658 +
1659 +        eg : The file is :
1660 +                1, house, 12, 19, apple
1661 +                2, horse, 21.9, 19, strawberry
1662 +                3, flower, 23, 2.17, cherry
1663 +
1664 +            >>> data = parsefile('myfile', [0, (2, 3), 4, 1], delimiter=',')
1665 +            data = [[1, (12, 19), u'apple', u'house'],
1666 +                    [2, (21.9, 19), u'strawberry', u'horse'],
1667 +                    [3, (23, 2.17), u'cherry', u'flower']]
1668 +
1669 +            By default, all cells are "autocast" (thanks to the
1670 +            ``autocast()`` function), but you can override this with the
1671 +            ``formatopt`` dictionary. Each key is the index to work on, and the
1672 +            value is the function to call. See the following example:
1673 +
1674 +            >>> data = parsefile('myfile', [0, (2, 3), 4, 1], delimiter=',',
1675 +            >>>                  formatopt={2:lambda x:x.decode('utf-8')})
1676 +            data = [[1, (u'12', 19), u'apple', u'house'],
1677 +                    [2, (u'21.9', 19), u'strawberry', u'horse'],
1678 +                    [3, (u'23', 2.17), u'cherry', u'flower']]
1679 +
1680 +    """
1681 +    def formatedoutput(filename):
1682 +        if field_size_limit:
1683 +            csv.field_size_limit(field_size_limit)
1684 +
1685 +        if isinstance(filename, basestring):
1686 +            csvfile = open(filename, 'r')
1687 +        else:
1688 +            csvfile = filename
1689 +        reader = csv.reader(csvfile, delimiter=delimiter)
1690 +        for row in reader:
1691 +            yield [cell.strip() for cell in row]
1692 +        csvfile.close()
1693 +
1694 +
1695 +
1696 +    result = []
1697 +    indexes = indexes or []
1698 +    formatopt = formatopt or {}
1699 +    for ind, row in enumerate(formatedoutput(filename)):
1700 +        row = [formatopt.get(i, lambda x: autocast(x, encoding))(cell)
1701 +               for i, cell in enumerate(row)]
1702 +        data = []
1703 +        if nbmax and ind > nbmax:
1704 +            break
1705 +        if not indexes:
1706 +            data = row
1707 +        else:
1708 +            for ind in indexes:
1709 +                if isinstance(ind, tuple):
1710 +                    data.append(tuple([row[i] for i in ind]))
1711 +                    if '' in data[-1]:
1712 +                        data[-1] = None
1713 +                elif row[ind]:
1714 +                    data.append(row[ind])
1715 +                else:
1716 +                    data.append(None)
1717 +
1718 +        result.append(data)
1719 +    return result
1720 +
1721 +def write_results(matched, alignset, targetset, resultfile):
1722 +    """ Write the ``matched`` dictionary, given ``alignset`` and ``targetset``,
1723 +        to ``resultfile``
1724 +    """
1725 +    openmode = 'a' if fileexists(resultfile) else 'w'
1726 +    with open(resultfile, openmode) as fobj:
1727 +        if openmode == 'w':
1728 +            fobj.write('aligned;targetted;distance\n')
1729 +        for aligned in matched:
1730 +            for target, dist in matched[aligned]:
1731 +                alignid = alignset[aligned][0]
1732 +                targetid = targetset[target][0]
1733 +                fobj.write('%s;%s;%s\n' %
1734 +                    (alignid.encode('utf-8') if isinstance(alignid, basestring)
1735 +                                             else alignid,
1736 +                     targetid.encode('utf-8') if isinstance(targetid, basestring)
1737 +                                              else targetid,
1738 +                     dist
1739 +                     ))
1740 +
1741 +def split_file(filename, outputdir, nblines=60000):
1742 +    """ Split `filename` into smaller files of ``nblines`` lines. Files are
1743 +        written into `outputdir`.
1744 +
1745 +        Return the list of files
1746 +    """
1747 +    NEW = object()
1748 +
1749 +    def readlines(fobj, nblines):
1750 +        """ yield all lines of the file, and
1751 +        at split-file boundaries, yield a NEW marker
1752 +        """
1753 +        for index, line in enumerate(fobj):
1754 +            if index and index % nblines == 0:
1755 +                yield NEW
1756 +            yield line
1757 +
1758 +    count = 0
1759 +    with open(filename, 'rb') as fobj:
1760 +        outfile = open(osp.join(outputdir, '%s' % count), 'wb')
1761 +        for line in readlines(fobj, nblines):
1762 +            if line is NEW:
1763 +                outfile.close()
1764 +                count += 1
1765 +                outfile = open(osp.join(outputdir, '%s' % count), 'wb')
1766 +                continue
1767 +            outfile.write(line)
1768 +        outfile.close()
1769 +        count += 1
1770 +    return map(str, xrange(count))
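For reference, a rough sketch of the relocated IO helpers (the file name and output directory below are hypothetical):

    # Illustrative sketch only -- 'records.csv' and '/tmp/chunks' are made up.
    from nazca.utils.dataio import parsefile, split_file

    # Keep column 0, the (2, 3) pair as a tuple, then columns 4 and 1,
    # as in the parsefile() docstring above.
    data = parsefile('records.csv', indexes=[0, (2, 3), 4, 1], delimiter=',')

    # Cut a large input into ~60000-line chunks ('/tmp/chunks' must already exist).
    chunks = split_file('records.csv', '/tmp/chunks', nblines=60000)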
diff --git a/utils/distances.py b/utils/distances.py
@@ -0,0 +1,456 @@
1771 +# -*- coding:utf-8 -*-
1772 +# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
1773 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
1774 +#
1775 +# This program is free software: you can redistribute it and/or modify it under
1776 +# the terms of the GNU Lesser General Public License as published by the Free
1777 +# Software Foundation, either version 2.1 of the License, or (at your option)
1778 +# any later version.
1779 +#
1780 +# This program is distributed in the hope that it will be useful, but WITHOUT
1781 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1782 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1783 +# details.
1784 +#
1785 +# You should have received a copy of the GNU Lesser General Public License along
1786 +# with this program. If not, see <http://www.gnu.org/licenses/>.
1787 +
1788 +from functools import partial
1789 +from math import cos, sqrt, pi #Needed for geographical distance
1790 +try:
1791 +    from dateutil import parser as dateparser
1792 +    DATEUTIL_ENABLED = True
1793 +except ImportError:
1794 +    DATEUTIL_ENABLED = False
1795 +from scipy import matrix, empty
1796 +
1797 +from nazca.utils.normalize import tokenize
1798 +
1799 +
1800 +###############################################################################
1801 +### UTILITY FUNCTIONS #########################################################
1802 +###############################################################################
1803 +def cdist(distance_callback, refset, targetset, matrix_normalized=False,
1804 +          ref_indexes=None, target_indexes=None):
1805 +    """ Compute the metric matrix, given two datasets and a metric
1806 +
1807 +    Parameters
1808 +    ----------
1809 +    refset: a dataset (list of records)
1810 +
1811 +    targetset: a dataset (list of records)
1812 +
1813 +    Returns
1814 +    -------
1815 +
1816 +    A distance matrix, of shape (len(refset), len(targetset))
1817 +    with the distance of each element in it.
1818 +    """
1819 +    ref_indexes = ref_indexes or xrange(len(refset))
1820 +    target_indexes = target_indexes or xrange(len(targetset))
1821 +    distmatrix = empty((len(ref_indexes), len(target_indexes)), dtype='float32')
1822 +    size = distmatrix.shape
1823 +    for i, iref in enumerate(ref_indexes):
1824 +        for j, jref in enumerate(target_indexes):
1825 +            d = 1
1826 +            if refset[iref] and targetset[jref]:
1827 +                d = distance_callback(refset[iref], targetset[jref])
1828 +                if matrix_normalized:
1829 +                    d = 1 - (1.0/(1.0 + d))
1830 +            distmatrix[i, j] = d
1831 +    return distmatrix
1832 +
1833 +def _handlespaces(stra, strb, distance, tokenizer=None, **kwargs):
1834 +    """ Compute the matrix of distances between all tokens of stra and strb
1835 +        (with function ``distance``). Extra args are given to the distance
1836 +        function
1837 +
1838 +        The distance returned is defined as the max of the min of each row of
1839 +        each distance matrix, see the example below:
1840 +
1841 +                 |  Victor |  Hugo                  Victor | Jean | Hugo
1842 +         Victor  |     0   |    5           Victor |  0    |  6   |  5
1843 +          Jean   |     6   |    4           Hugo   |  5    |  4   |  0
1844 +          Hugo   |     5   |    0
1845 +
1846 +                 --> 4                                --> 0
1847 +
1848 +        Return 4
1849 +    """
1850 +
1851 +    if ' ' not in stra:
1852 +        stra += ' '
1853 +    if ' ' not in strb:
1854 +        strb += ' '
1855 +
1856 +    toka = tokenize(stra, tokenizer)
1857 +    tokb = tokenize(strb, tokenizer)
1858 +    # If not same number of tokens, complete the smallest list with empty strings
1859 +    if len(toka) != len(tokb):
1860 +        mint = toka if len(toka)<len(tokb) else tokb
1861 +        maxt = toka if len(toka)>len(tokb) else tokb
1862 +        mint.extend(['' for i in range(len(maxt)-len(mint))])
1863 +
1864 +    listmatrix = []
1865 +    for i in xrange(len(toka)):
1866 +        listmatrix.append([distance(toka[i], tokb[j], **kwargs) for j in xrange(len(tokb))])
1867 +    m = matrix(listmatrix)
1868 +    minlist = [m[i,:].min() for i in xrange(m.shape[0])]
1869 +    minlist.extend([m[:,i].min() for i in xrange(m.shape[1])])
1870 +    return max(minlist)
1871 +
1872 +
1873 +###############################################################################
1874 +### NUMERICAL DISTANCES #######################################################
1875 +###############################################################################
1876 +def euclidean(a, b):
1877 +    """ Simple euclidean distance
1878 +    """
1879 +    try:
1880 +        return abs(a - b)
1881 +    except TypeError:
1882 +        #a and b may be strings
1883 +        return abs(float(a) - float(b))
1884 +
1885 +
1886 +###############################################################################
1887 +### STRING DISTANCES ##########################################################
1888 +###############################################################################
1889 +def levenshtein(stra, strb, tokenizer=None):
1890 +    """ Compute the Levenshtein distance between stra and strb.
1891 +
1892 +    The Levenshtein distance is defined as the minimal cost to transform stra
1893 +    into strb, where 3 operators are allowed :
1894 +        - Replace one character of stra into a character of strb
1895 +        - Add one character of strb into stra
1896 +        - Remove one character of stra
1897 +
1898 +        If spaces are found in stra or strb, this method returns
1899 +            _handlespaces(stra, strb, levenshtein)
1900 +    """
1901 +    if ' ' in stra or ' ' in strb:
1902 +        return _handlespaces(stra, strb, levenshtein, tokenizer)
1903 +
1904 +    lenb = len(strb)
1905 +    onerowago = None
1906 +    thisrow = range(1, lenb + 1) + [0]
1907 +    for x in xrange(len(stra)):
1908 +        onerowago, thisrow = thisrow, [0]*lenb + [x+1]
1909 +        for y in xrange(lenb):
1910 +            delcost = onerowago[y] + 1
1911 +            addcost = thisrow[y - 1] + 1
1912 +            subcost = onerowago[y - 1] + (stra[x] != strb[y])
1913 +            thisrow[y] = min(delcost, addcost, subcost)
1914 +    return thisrow[lenb - 1]
1915 +
1916 +def soundexcode(word, language='french'):
1917 +    """ Return the Soundex code of the word ``word``
1918 +        For more information about soundex code see wiki_
1919 +
1920 +        ``language`` can be 'french' or 'english'
1921 +
1922 +        .. _wiki: https://en.wikipedia.org/wiki/Soundex
1923 +
1924 +        For words containing spaces, use ``soundex()`` below, which falls back to
1925 +            _handlespaces(stra, strb, soundex, language=language)
1926 +    """
1927 +
1928 +    vowels = 'AEHIOUWY'
1929 +    if language.lower() == 'french' :
1930 +        consonnantscode = {'B': '1', 'P': '1',
1931 +                           'C': '2', 'K': '2', 'Q': '2',
1932 +                           'D': '3', 'T': '3',
1933 +                           'L': '4',
1934 +                           'M': '5', 'N': '5',
1935 +                           'R': '6',
1936 +                           'G': '7', 'J': '7',
1937 +                           'X': '8', 'Z': '8', 'S': '8',
1938 +                           'F': '9', 'V': '9'
1939 +                          }
1940 +    elif language.lower() == 'english':
1941 +        consonnantscode = {'B': '1', 'F': '1', 'P': '1', 'V': '1',
1942 +                           'C': '2', 'G': '2', 'J': '2', 'K': '2',
1943 +                           'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
1944 +                           'D': '3', 'T': '3',
1945 +                           'L': '4',
1946 +                           'M': '5', 'N': '5',
1947 +                           'R': '6'
1948 +                          }
1949 +    else:
1950 +        raise NotImplementedError('Soundex code is not supported (yet ?) for '
1951 +                                  'this language (%s). '
1952 +                                  'Supported languages are french and english' % language)
1953 +    word = word.strip().upper()
1954 +    code = word[0]
1955 +    # After this ``for`` loop, ``code`` is
1956 +    # the first letter of ``word`` followed by all the consonants of word,
1957 +    # where, of consecutive consonants, only the first is kept,
1958 +    # and of two identical consonants separated by a W or an H, only the first
1959 +    # is kept too.
1960 +    for i in xrange(1, len(word)):
1961 +        if word[i] in vowels:
1962 +            continue
1963 +        if word[i - 1] not in vowels and \
1964 +           consonnantscode[word[i]] == consonnantscode.get(code[-1], ''):
1965 +            continue
1966 +        if i + 2 < len(word) and word[i + 1] in 'WH' and \
1967 +           consonnantscode[word[i]] == consonnantscode.get(word[i + 2], ''):
1968 +            continue
1969 +        code += word[i]
1970 +        if len(code) > 4:
1971 +            break
1972 +
1973 +    #Replace according to the codes
1974 +    code = code[0] + ''.join([consonnantscode[c] for c in code[1:]])
1975 +    ###First four letters, completed by zeros
1976 +    return code[:4] + '0'*(4 - len(code))
1977 +
1978 +def soundex(stra, strb, language='french', tokenizer=None):
1979 +    """ Return the 1/0 distance between the soundex code of stra and strb.
1980 +        0 means they have the same code, 1 they don't
1981 +    """
1982 +    if ' ' in stra or ' ' in strb:
1983 +        return _handlespaces(stra, strb, soundex, tokenizer=tokenizer, language=language)
1984 +
1985 +    return 0 if (soundexcode(stra, language) == soundexcode(strb, language)) \
1986 +             else 1
1987 +
1988 +def jaccard(stra, strb, tokenizer=None):
1989 +    """ Return the jaccard distance between stra and strb, considering the token
1990 +        sets of stra and strb. If no tokenizer is given, it uses
1991 +        nazca.utils.normalize.tokenize's default one.
1992 +
1993 +        J(A, B) = |A \cap B| / |A \cup B|
1994 +        d(A, B) = 1 - J(A, B)
1995 +    """
1996 +    seta = set(tokenize(stra, tokenizer))
1997 +    setb = set(tokenize(strb, tokenizer))
1998 +    return generic_jaccard(seta, setb)
1999 +
2000 +def generic_jaccard(seta, setb):
2001 +    """ Return the jaccard distance between two sets A and B.
2002 +
2003 +        J(A, B) = |A \cap B| / |A \cup B|
2004 +        d(A, B) = 1 - J(A, B)
2005 +    """
2006 +    return 1.0 - 1.0*len(seta.intersection(setb))/len(seta.union(setb))
2007 +
2008 +
2009 +###############################################################################
2010 +### TEMPORAL DISTANCES ########################################################
2011 +###############################################################################
2012 +if DATEUTIL_ENABLED:
2013 +    class FrenchParserInfo(dateparser.parserinfo):
2014 +        """ Inherit from dateutil.parser.parserinfo and translate the English
2015 +            language-dependent variables into French.
2016 +        """
2017 +
2018 +        HMS = [(u'h', u'heure', u'heures'),
2019 +               (u'm', u'minute', u'minutes'),
2020 +               (u's', u'seconde', u'secondes')]
2021 +        JUMP = [u' ', u'.', u',', u';', u'-', u'/', u"'",
2022 +               u'a', u'le', u'et', u'er']
2023 +        MONTHS = [(u'Jan', u'Janvier'), (u'Fev', u'Fevrier'),
2024 +                  (u'Mar', u'Mars'), (u'Avr', u'Avril'), (u'Mai', u'Mai'),
2025 +                  (u'Jun', u'Juin'), (u'Jui', u'Juillet'),
2026 +                  (u'Aou', u'Aout'), (u'Sep', u'Septembre'),
2027 +                  (u'Oct', u'Octobre'), (u'Nov', u'Novembre'),
2028 +                  (u'Dec', u'Decembre')]
2029 +        PERTAIN = [u'de']
2030 +        WEEKDAYS = [(u'Lun', u'Lundi'),
2031 +                    (u'Mar', u'Mardi'),
2032 +                    (u'Mer', u'Mercredi'),
2033 +                    (u'Jeu', u'Jeudi'),
2034 +                    (u'Ven', u'Vendredi'),
2035 +                    (u'Sam', u'Samedi'),
2036 +                    (u'Dim', u'Dimanche')]
2037 +
2038 +    def temporal(stra, strb, granularity=u'days', parserinfo=FrenchParserInfo,
2039 +                 dayfirst=True, yearfirst=False):
2040 +        """ Return the distance between two strings (read as dates).
2041 +
2042 +            ``granularity`` can be either ``days`` or ``months`` or ``years``
2043 +            (note the plural form!)
2044 +            The ``parserinfo`` argument controls the language (French by default)
2045 +
2046 +            ``dayfirst`` and ``yearfirst`` are used in case of ambiguity, for
2047 +            instance 09/09/09, by default it assumes it's day/month/year
2048 +
2049 +            Neither stra nor strb may contain accented characters; clean them first.
2050 +        """
2051 +
2052 +        datea = dateparser.parse(stra, parserinfo=parserinfo(dayfirst,
2053 +                                 yearfirst), fuzzy=True)
2054 +        dateb = dateparser.parse(strb, parserinfo=parserinfo(dayfirst,
2055 +                                 yearfirst), fuzzy=True)
2056 +        diff = datea - dateb
2057 +        if granularity.lower() == 'years':
2058 +            return abs(diff.days/365.25)
2059 +        if granularity.lower() == 'months':
2060 +            return abs(diff.days/30.5)
2061 +        return abs(diff.days)
2062 +
2063 +
2064 +###############################################################################
2065 +### GEOGRAPHICAL DISTANCES ####################################################
2066 +###############################################################################
2067 +def geographical(pointa, pointb, in_radians=False, planet_radius=6371009,
2068 +                 units='m'):
2069 +    """ Return the geographical distance between two points.
2070 +
2071 +        Both points must be tuples (latitude, longitude)
2072 +
2073 +        - in_radians is True if latitude and longitude are in radians, False
2074 +          otherwise
2075 +        - planet_radius is the planet's radius in meters. By default, it is the
2076 +          Earth's.
2077 +
2078 +        - `units` can be 'm' (meters) or 'km' (kilometers)
2079 +    """
2080 +    pointa = (float(pointa[0]), float(pointa[1]))
2081 +    pointb = (float(pointb[0]), float(pointb[1]))
2082 +
2083 +    difflat = pointa[0] - pointb[0]
2084 +    difflong = pointa[1] - pointb[1]
2085 +    meanlat = (pointa[0] + pointb[0])/2.0
2086 +
2087 +    if not in_radians:
2088 +        difflat *= pi/180.0
2089 +        difflong *= pi/180.0
2090 +        meanlat *= pi/180.0
2091 +
2092 +    coef = 1. if units == 'm' else 0.001
2093 +    return coef*planet_radius*sqrt(difflat**2 + (cos(meanlat)*difflong)**2)
2094 +
2095 +
2096 +###############################################################################
2097 +### BASE PROCESSING ############################################################
2098 +###############################################################################
2099 +class BaseProcessing(object):
2100 +    """ A processing object used to provide an abstraction over the different
2101 +    distance functions, and help build Nazca processes. """
2102 +
2103 +    def __init__(self, ref_attr_index=None, target_attr_index=None,
2104 +                 distance_callback=euclidean, weight=1, matrix_normalized=False):
2105 +        """ Initiate the BaseProcessing
2106 +
2107 +        Parameters
2108 +        ----------
2109 +
2110 +        ref_attr_index: index of the attribute of interest in a record
2111 +                        for the reference dataset
2112 +                        (i.e. attribute to be used for key computation)
2113 +
2114 +        target_attr_index: index of the attribute of interest in a record
2115 +                           for the target dataset
2116 +                           (i.e. attribute to be used for key computation)
2117 +
2118 +        distance_callback: distance callback. Default is euclidean distance.
2119 +
2120 +        weight: weight of the processing in a global distance matrix
2121 +
2122 +        matrix_normalized: Boolean. If matrix_normalized is True,
2123 +                           the distance between two points is changed to
2124 +                           a value between 0 (equal) and 1 (totally different).
2125 +                           To avoid useless computation and scale
2126 +                           problems, the following “normalization” is done:
2127 +                                d = 1 - 1/(1 + d(x, y))
2128 +
2129 +        """
2130 +        self.ref_attr_index = ref_attr_index
2131 +        self.target_attr_index = target_attr_index
2132 +        self.distance_callback = distance_callback
2133 +        self.weight = weight
2134 +        self.matrix_normalized = matrix_normalized
2135 +
2136 +    def distance(self, reference_record, target_record):
2137 +        """ Compute the distance between two records
2138 +
2139 +        Parameters
2140 +        ----------
2141 +        reference_record: a record (tuple/list of values) of the reference dataset.
2142 +
2143 +        target_record: a record (tuple/list of values) of the target dataset.
2144 +
2145 +        """
2146 +        refrecord = (reference_record[self.ref_attr_index] if self.ref_attr_index
2147 +                     else reference_record)
2148 +        targetrecord = (target_record[self.target_attr_index] if self.target_attr_index
2149 +                        else target_record)
2150 +        return self.distance_callback(refrecord, targetrecord)
2151 +
2152 +    def cdist(self, refset, targetset, ref_indexes=None, target_indexes=None):
2153 +        """ Compute the metric matrix, given two datasets and a metric
2154 +
2155 +        Parameters
2156 +        ----------
2157 +        refset: a dataset (list of records)
2158 +
2159 +        targetset: a dataset (list of records)
2160 +
2161 +        Returns
2162 +        -------
2163 +
2164 +        A distance matrix, of shape (len(refset), len(targetset))
2165 +        with the distance of each element in it.
2166 +        """
2167 +        return cdist(self.distance, refset, targetset,
2168 +                     matrix_normalized=self.matrix_normalized,
2169 +                     ref_indexes=ref_indexes, target_indexes=target_indexes)
2170 +
2171 +    def pdist(self, dataset):
2172 +        """ Compute the upper triangular matrix in a way similar
2173 +        to scipy.spatial.distance.pdist
2174 +
2175 +        Parameters
2176 +        ----------
2177 +        dataset: a dataset (list of records)
2178 +
2179 +        Returns
2180 +        -------
2181 +
2182 +        The values of the upper triangular distance matrix
2183 +        (of shape (len(dataset), len(dataset)) with the distance of each element in it.
2184 +        The values are sorted as row 1, row 2, ...
2185 +        """
2186 +        values = []
2187 +        for i in xrange(len(dataset)):
2188 +            for j in xrange(i+1, len(dataset)):
2189 +                d = 1
2190 +                if dataset[i] and dataset[j]:
2191 +                    d = self.distance(dataset[i], dataset[j])
2192 +                    if self.matrix_normalized:
2193 +                        d = 1 - (1.0/(1.0 + d))
2194 +                values.append(d)
2195 +        return values
2196 +
2197 +
2198 +###############################################################################
2199 +### CONCRETE PROCESSINGS #######################################################
2200 +###############################################################################
2201 +class LevenshteinProcessing(BaseProcessing):
2202 +    """ A processing based on the levenshtein distance.
2203 +    """
2204 +
2205 +    def __init__(self, ref_attr_index=None, target_attr_index=None,
2206 +                 tokenizer=None, weight=1, matrix_normalized=False):
2207 +        distance_callback = partial(levenshtein,
2208 +                                    tokenizer=tokenizer)
2209 +        super(LevenshteinProcessing, self).__init__(ref_attr_index,
2210 +                                                   target_attr_index,
2211 +                                                   distance_callback,
2212 +                                                   weight,matrix_normalized)
2213 +
2214 +
2215 +class GeographicalProcessing(BaseProcessing):
2216 +    """ A processing based on the geographical distance.
2217 +    """
2218 +
2219 +    def __init__(self, ref_attr_index=None, target_attr_index=None,
2220 +                 in_radians=False, planet_radius=6371009, units='m', weight=1, matrix_normalized=False):
2221 +        distance_callback = partial(geographical, in_radians=in_radians,
2222 +                                    planet_radius=planet_radius, units=units)
2223 +        super(GeographicalProcessing, self).__init__(ref_attr_index,
2224 +                                                    target_attr_index,
2225 +                                                    distance_callback,
2226 +                                                    weight,matrix_normalized)
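For reference, a rough sketch of the distance helpers and processing objects under their new module path (the records are invented and the commented results are approximate):

    # Illustrative sketch only -- records and expected values are made up.
    from nazca.utils.distances import (levenshtein, soundex, geographical,
                                       LevenshteinProcessing)

    print levenshtein('niche', 'chiens')            # -> 5
    print soundex('rupert', 'robert', 'english')    # -> 0 (same soundex code)
    # Flat-earth approximation, roughly 390 km between Paris and Lyon
    print geographical((48.8566, 2.3522), (45.7640, 4.8357), units='km')

    # Processing objects wrap a distance for the record-linkage aligner
    refset = [('r1', 'Victor Hugo'), ('r2', 'Emile Zola')]
    targetset = [('t1', 'Victor Hugo'), ('t2', 'Jules Verne')]
    processing = LevenshteinProcessing(ref_attr_index=1, target_attr_index=1)
    print processing.cdist(refset, targetset)       # 2x2 distance matrix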
diff --git a/utils/minhashing.py b/utils/minhashing.py
@@ -0,0 +1,184 @@
2227 +# -*- coding:utf-8 -*-
2228 +# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
2229 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
2230 +#
2231 +# This program is free software: you can redistribute it and/or modify it under
2232 +# the terms of the GNU Lesser General Public License as published by the Free
2233 +# Software Foundation, either version 2.1 of the License, or (at your option)
2234 +# any later version.
2235 +#
2236 +# This program is distributed in the hope that it will be useful, but WITHOUT
2237 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
2238 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
2239 +# details.
2240 +#
2241 +# You should have received a copy of the GNU Lesser General Public License along
2242 +# with this program. If not, see <http://www.gnu.org/licenses/>.
2243 +
2244 +import cPickle
2245 +
2246 +from random import randint
2247 +from collections import defaultdict
2248 +
2249 +import numpy as np
2250 +from scipy.optimize import bisect
2251 +
2252 +from nazca.utils.normalize import iter_wordgrams
2253 +
2254 +
2255 +def randomhashfunction(zr):
2256 +    """ Return a random hash function, mapping x in Z to Z/zr
2257 +        h: x -> (a*x + b) mod zr
2258 +
2259 +    """
2260 +    bound = max(zr - 1, 1)
2261 +    a = randint(1, bound)
2262 +    b = randint(1, bound)
2263 +
2264 +    def hashfunc(x):
2265 +        return ((a*x + b)%zr)
2266 +
2267 +    return hashfunc
2268 +
2269 +
2270 +class Minlsh(object):
2271 +    """ Operate minhashing + locality-sensitive hashing to find similar sentences
2272 +    """
2273 +
2274 +    def __init__(self, verbose=False):
2275 +        self._trained = False
2276 +        self.sigmatrix = None
2277 +        self._verbose = verbose
2278 +
2279 +    def train(self, sentences, k=2, siglen=200):
2280 +        """ Train the minlsh on the given sentences.
2281 +
2282 +            - `k` is the length of the k-wordgrams used
2283 +              (the lower k is, the faster is the training)
2284 +            - `siglen` the length of the sentences signature
2285 +
2286 +        """
2287 +
2288 +        rows, shape = self._buildmatrixdocument(sentences, k)
2289 +
2290 +        if self._verbose: print "Document matrix built. Computing the signature matrix..."
2291 +
2292 +        self._computesignaturematrix(rows, shape, siglen)
2293 +        self._trained = True
2294 +
2295 +
2296 +    def _buildmatrixdocument(self, sentences, k):
2297 +        """ Return a sparse matrix where :
2298 +
2299 +            - Each sentence is a column
2300 +            - Each row is an element of the universal set
2301 +
2302 +            Each value (r, c) is set to 1 if the element at row r is in the
2303 +            sentence c, 0 otherwise
2304 +
2305 +        """
2306 +
2307 +        rows, universe, sizeofuniverse = [], {}, 0
2308 +        for nb, sent in enumerate(sentences):
2309 +            row = []
2310 +            for w in iter_wordgrams(sent, k):
2311 +                row.append(universe.setdefault(w, sizeofuniverse))
2312 +                if row[-1] == sizeofuniverse:
2313 +                    sizeofuniverse += 1
2314 +            rows.append(row)
2315 +            if self._verbose and nb % 50000 == 0:
2316 +                print nb
2317 +
2318 +        return rows, (len(rows), sizeofuniverse)
2319 +
2320 +    def _computesignaturematrix(self, rows, shape, siglen):
2321 +        """ Return a matrix where each column is the signature of a document.
2322 +            The signature is composed of `siglen` numbers.
2323 +
2324 +            The more rows the documents have in common, the closer they are.
2325 +        """
2326 +
2327 +        nrows, ncols = shape
2328 +        sig = np.empty((siglen, nrows))
2329 +        #Generate the random hash functions
2330 +        hashfunc = [randomhashfunction(ncols) for _ in xrange(siglen)]
2331 +        #Compute hashing values just for once.
2332 +        #Avoid multiple recomputations for the same column.
2333 +        hashvalues = np.array([[hashfunc[i](r) for r in xrange(ncols)]
2334 +                                for i in  xrange(siglen)])
2335 +
2336 +        docind = 0
2337 +        while rows:
2338 +            doc = rows.pop(0)
2339 +            #Concatenate the needed rows.
2340 +            tmp = np.dstack([hashvalues[:, r] for r in doc])
2341 +            #Take the minimum of hashes
2342 +            sig[:, docind] = np.min(tmp[0], 1)
2343 +            docind += 1
2344 +            if self._verbose and docind % 50000 == 0:
2345 +                print (docind * 100) / nrows
2346 +        self.sigmatrix = sig
2347 +
2348 +    def save(self, savefile):
2349 +        """ Save the training into `savefile` for a future use """
2350 +
2351 +        if not self._trained:
2352 +            print "Not trained, nothing to save"
2353 +            return
2354 +
2355 +        with open(savefile, 'wb') as fobj:
2356 +            pickler = cPickle.Pickler(fobj)
2357 +            pickler.dump(self.sigmatrix)
2358 +
2359 +    def load(self, savefile):
2360 +        """ Load a trained minhashing """
2361 +
2362 +        with open(savefile, 'rb') as fobj:
2363 +            pickler = cPickle.Unpickler(fobj)
2364 +            self.sigmatrix = pickler.load()
2365 +
2366 +        if self.sigmatrix is not None:
2367 +            self._trained = True
2368 +        else:
2369 +            self._trained = False
2370 +
2371 +    def computebandsize(self, threshold, nbrows):
2372 +        """ Compute the bandsize according to the threshold given """
2373 +
2374 +        ### t ~ (1/b)^(1/r), where t is the threshold, b the number of
2375 +        ### bands, and r the number of rows per band. The length of the
2376 +        ### matrix is nbrows = b*r, so t ~ (r/nbrows)^(1/r). So, let's
2377 +        ### find the root of f(x) = (x/nbrows)^(1/x) - t.
2378 +        def f(x):
2379 +            y = pow(x/nbrows, 1. /x) - threshold
2380 +            return y
2381 +
2382 +        ## Solve f(x) = 0, with x having values in [1, nbrows]
2383 +        return int(bisect(f, 1, nbrows))
2384 +
2385 +    def predict(self, threshold):
2386 +        """ Return a set of tuples of *possibly* similar sentences
2387 +        """
2388 +        if not self._trained:
2389 +            print "Train it before"
2390 +            return
2391 +
2392 +        if not (0 < threshold <= 1):
2393 +            print "Threshold must be in ]0 ; 1]"
2394 +            return
2395 +
2396 +        sig = self.sigmatrix
2397 +        # Threshold is a percentage of similarity
2398 +        # It should be inverted here (0 is close, 1 is far)
2399 +        threshold = 1 - threshold
2400 +        bandsize = self.computebandsize(threshold, self.sigmatrix.shape[0])
2401 +
2402 +        buckets = defaultdict(set)
2403 +        similars = set()
2404 +        for r in xrange(0, sig.shape[0], bandsize):
2405 +            buckets.clear()
2406 +            for i in xrange(sig.shape[1]):
2407 +                buckets[tuple(sig[r:r+bandsize, i])].add(i)
2408 +            similars.update(set(tuple(v) for v in buckets.itervalues()
2409 +                                         if len(v) > 1))
2410 +        return similars
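For reference, a rough sketch of the Minlsh workflow under the new module path (the sentences are invented; the predicted grouping is what one would expect, not a guaranteed output):

    # Illustrative sketch only -- sentences are made up.
    from nazca.utils.normalize import simplify
    from nazca.utils.minhashing import Minlsh

    sentences = [u'le chat mange la souris',
                 u'le chat mange le fromage',
                 u'un avion vole dans le ciel']
    minlsh = Minlsh()
    minlsh.train([simplify(s, remove_stopwords=True) for s in sentences],
                 k=1, siglen=200)
    # The two 'chat' sentences should land in the same bucket, e.g. set([(0, 1)])
    print minlsh.predict(0.3)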
diff --git a/utils/ner_dataio.py b/utils/ner_dataio.py
@@ -0,0 +1,140 @@
2411 +# -*- coding: utf-8 -*-
2412 +""" IO for Named Entity Recognition.
2413 +"""
2414 +import json
2415 +import urllib
2416 +import lxml.etree as ET
2417 +
2418 +
2419 +###############################################################################
2420 +### SPARQL UTILITIES ##########################################################
2421 +###############################################################################
2422 +def sparql_query(query, endpoint):
2423 +    """ Execute a query on an endpoint:
2424 +
2425 +    sparql_query(query=u'''SELECT ?uri ?type
2426 +                           WHERE{
2427 +                           ?uri rdfs:label "Python"@en .
2428 +                           ?uri rdf:type ?type}''',
2429 +                           endpoint=u'http://dbpedia.org/sparql')
2430 +    """
2431 +    from SPARQLWrapper import SPARQLWrapper, JSON
2432 +    sparql = SPARQLWrapper(endpoint)
2433 +    sparql.setQuery(query)
2434 +    sparql.setReturnFormat(JSON)
2435 +    try:
2436 +        rawresults = sparql.query().convert()
2437 +        labels = rawresults['head']['vars']
2438 +        return rawresults["results"]["bindings"]
2439 +    except Exception:
2440 +        print 'Error in sparql query'
2441 +        return []
2442 +
2443 +
2444 +###############################################################################
2445 +### RQL UTILITIES #############################################################
2446 +###############################################################################
2447 +def get_cw_cnx(endpoint):
2448 +    """ Get a cnx on a CubicWeb database
2449 +    """
2450 +    from cubicweb import dbapi
2451 +    from cubicweb.cwconfig import CubicWebConfiguration
2452 +    from cubicweb.entities import AnyEntity
2453 +    CubicWebConfiguration.load_cwctl_plugins()
2454 +    config = CubicWebConfiguration.config_for(endpoint)
2455 +    sourceinfo = config.sources()['admin']
2456 +    login = sourceinfo['login']
2457 +    password = sourceinfo['password']
2458 +    _, cnx = dbapi.in_memory_repo_cnx(config, login, password=password)
2459 +    req = cnx.request()
2460 +    return req
2461 +
2462 +def rql_appid_query(query, endpoint, _cache_cnx={}, **kwargs):
2463 +    """ Execute a query on an appid endpoint:
2464 +
2465 +    rql_query('Any X WHERE X label "Python"', 'localhost')
2466 +
2467 +    Additional arguments can be passed to be properly substituted
2468 +    in the execute() function.
2469 +    """
2470 +    if endpoint in _cache_cnx:
2471 +        cnx = _cache_cnx[endpoint]
2472 +    else:
2473 +        cnx = get_cw_cnx(endpoint)
2474 +        _cache_cnx[endpoint] = cnx
2475 +    return cnx.execute(query, kwargs)
2476 +
2477 +def rql_url_query(query, endpoint):
2478 +    """ Execute a query on an url endpoint:
2479 +
2480 +    rql_query('Any X WHERE X label "Python"', 'localhost')
2481 +    """
2482 +    url = urllib.basejoin(endpoint, '?rql=%s&vid=jsonexport' % query)
2483 +    return json.loads(urllib.urlopen(url).read())
2484 +
2485 +
2486 +###############################################################################
2487 +### OUTPUT UTILITIES ##########################################################
2488 +###############################################################################
2489 +class AbstractNerdyPrettyPrint(object):
2490 +    """ Pretty print the output of a Nerdy process
2491 +    """
2492 +
2493 +    def pprint_text(self, text, named_entities, **kwargs):
2494 +        newtext = u''
2495 +        indice = 0
2496 +        tindices = dict([(t.start, (uri, t)) for uri, p, t in named_entities])
2497 +        while indice < len(text):
2498 +            if indice in tindices:
2499 +                uri, t = tindices[indice]
2500 +                words = text[t.start:t.end]
2501 +                fragment = self.pprint_entity(uri, words, **kwargs)
2502 +                if not self.is_valid(newtext+fragment+text[t.end:]):
2503 +                    fragment = words
2504 +                newtext += fragment
2505 +                indice = t.end
2506 +            else:
2507 +                newtext += text[indice]
2508 +                indice += 1
2509 +        return newtext
2510 +
2511 +    def pprint_entity(self, uri, word, **kwargs):
2512 +        """ Pretty print an entity """
2513 +        raise NotImplementedError
2514 +
2515 +    def is_valid(self, newtext):
2516 +        """Override to check the validity of the prettified content at each
2517 +        enrichment step"""
2518 +        return True
2519 +
2520 +
2521 +class NerdyHTMLPrettyPrint(AbstractNerdyPrettyPrint):
2522 +    """ Pretty print the output of a Nerdy process
2523 +    """
2524 +
2525 +    def pprint_entity(self, uri, word, **kwargs):
2526 +        """ Pretty print an entity """
2527 +        klass = ' class="%s"' % kwargs['html_class'] if 'html_class' in kwargs else ''
2528 +        return u'<a href="%s"%s>%s</a>' % (uri, klass, word)
2529 +
2530 +
2531 +class NerdyValidXHTMLPrettyPrint(NerdyHTMLPrettyPrint):
2532 +
2533 +    XHTML_DOC_TEMPLATE = '''\
2534 +<?xml version="1.0" encoding="UTF-8" ?>
2535 +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
2536 +<html xmlns="http://www.w3.org/1999/xhtml">
2537 +<head>
2538 +<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
2539 +<title>nerdy</title>
2540 +</head>
2541 +<body><div>%s</div></body>
2542 +</html>'''
2543 +
2544 +    def is_valid(self, html):
2545 +        try:
2546 +            ET.fromstring(self.XHTML_DOC_TEMPLATE % html.encode('utf-8'),
2547 +                          parser=ET.XMLParser(dtd_validation=True))
2548 +        except ET.XMLSyntaxError:
2549 +            return False
2550 +        return True
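For reference, a rough sketch of the HTML pretty-printer (the Token namedtuple is a hypothetical stand-in for the token objects produced by the NER core; anything exposing .start and .end works, and the URI is only an example):

    # Illustrative sketch only -- Token is a stand-in, not the real NER token class.
    from collections import namedtuple
    from nazca.utils.ner_dataio import NerdyHTMLPrettyPrint

    Token = namedtuple('Token', ['word', 'start', 'end'])
    text = u'Victor Hugo was born in Besancon.'
    named_entities = [(u'http://dbpedia.org/resource/Victor_Hugo', None,
                       Token(u'Victor Hugo', 0, 11))]
    pprinter = NerdyHTMLPrettyPrint()
    print pprinter.pprint_text(text, named_entities, html_class='ner')
    # -> u'<a href="..." class="ner">Victor Hugo</a> was born in Besancon.'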
diff --git a/utils/normalize.py b/utils/normalize.py
@@ -0,0 +1,415 @@
2551 +# -*- coding:utf-8 -*-
2552 +# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
2553 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
2554 +#
2555 +# This program is free software: you can redistribute it and/or modify it under
2556 +# the terms of the GNU Lesser General Public License as published by the Free
2557 +# Software Foundation, either version 2.1 of the License, or (at your option)
2558 +# any later version.
2559 +#
2560 +# This program is distributed in the hope that it will be useful, but WITHOUT
2561 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
2562 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
2563 +# details.
2564 +#
2565 +# You should have received a copy of the GNU Lesser General Public License along
2566 +# with this program. If not, see <http://www.gnu.org/licenses/>.
2567 +
2568 +import re
2569 +from string import punctuation
2570 +from warnings import warn
2571 +from unicodedata import normalize as _uninormalize
2572 +from functools import partial
2573 +
2574 +
2575 +FRENCH_STOPWORDS = set([u'alors', u'au', u'aux', u'aucuns', u'aussi', u'autre', u'avant',
2576 +u'avec', u'avoir', u'bon', u'car', u'ce', u'cela', u'ces', u'ceux', u'chaque',
2577 +u'ci', u'comme', u'comment', u'dans', u'de', u'des', u'du', u'dedans', u'dehors',
2578 +u'depuis', u'deux', u'devrait', u'doit', u'donc', u'dos', u'droite', u'début',
2579 +u'elle', u'elles', u'en', u'encore', u'essai', u'est', u'et', u'eu', u'eux', u'fait',
2580 +u'faites', u'fois', u'font', u'force', u'haut', u'hors', u'ici', u'il', u'ils',
2581 +u'je', u'juste', u'la', u'le', u'les', u'leur', u'lui', u'là', u'ma', u'maintenant',
2582 +u'mais', u'me', u'mes', u'moi', u'moins', u'mon', u'mot', u'même', u'ne',
2583 +u'ni', u'nommés', u'nos',
2584 +u'notre', u'nous', u'nouveaux', u'on', u'ou', u'où', u'par', u'parce', u'parole',
2585 +u'pas', u'personnes', u'peut', u'peu', u'pièce', u'plupart', u'pour',
2586 +u'pourquoi', u'quand', u'que', u'quel', u'quelle', u'quelles', u'quels', u'qui',
2587 +u'sa', u'sans', u'se', u'ses', u'seulement', u'si', u'sien', u'son', u'sont', u'sous',
2588 +u'soyez', u'sujet', u'sur', u'ta', u'tandis', u'tellement', u'te', u'tels', u'tes', u'toi',
2589 +u'ton', u'tous', u'tout', u'trop', u'très', u'tu', u'un', u'une', u'valeur', u'voie',
2590 +u'voient', u'vont', u'vos', u'votre', u'vous', u'vu', u'ça', u'étaient', u'état',
2591 +u'étions', u'été', u'être'])
2592 +
2593 +MANUAL_UNICODE_MAP = {
2594 +    u'\xa1': u'!',    # INVERTED EXCLAMATION MARK
2595 +    u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE
2596 +    u'\u2044': u'/',  # FRACTION SLASH
2597 +    u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE
2598 +    u'\xa9': u'(c)',  # COPYRIGHT SIGN
2599 +    u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
2600 +    u'\xe6': u'ae',   # LATIN SMALL LETTER AE
2601 +    u'\xae': u'(r)',  # REGISTERED SIGN
2602 +    u'\u0153': u'oe', # LATIN SMALL LIGATURE OE
2603 +    u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE
2604 +    u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE
2605 +    u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE
2606 +    u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
2607 +    u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S
2608 +    }
2609 +
2610 +
2611 +###############################################################################
2612 +### NORMALIZE FUNCTIONS #######################################################
2613 +###############################################################################
2614 +def unormalize(ustring, substitute=None):
2615 +    """replace diacritical characters with their corresponding ascii characters
2616 +
2617 +    Convert the unicode string to its long normalized form (a unicode character
2618 +    may be transformed into several characters) and keep the first one only.
2619 +    The normal form KD (NFKD) will apply the compatibility decomposition, i.e.
2620 +    replace all compatibility characters with their equivalents.
2621 +
2622 +    :type substitute: str
2623 +    :param substitute: replacement character to use if decomposition fails
2624 +
2625 +    :see: Another project about ASCII transliterations of Unicode text
2626 +          http://pypi.python.org/pypi/Unidecode
2627 +    """
2628 +    res = []
2629 +    for letter in ustring:
2630 +        try:
2631 +            replacement = MANUAL_UNICODE_MAP[letter]
2632 +        except KeyError:
2633 +            if isinstance(letter, unicode):
2634 +                replacement = _uninormalize('NFKD', letter)[0]
2635 +            else:
2636 +                replacement = letter
2637 +            if ord(replacement) >= 2 ** 7:
2638 +                if substitute is None:
2639 +                    raise ValueError("can't deal with non-ascii based characters")
2640 +                replacement = substitute
2641 +        res.append(replacement)
2642 +    return u''.join(res)
2643 +
2644 +def lunormalize(sentence, substitute=None):
2645 +    """ Normalize a sentence (i.e. remove accents, lowercase it, etc.) """
2646 +    return unormalize(sentence, substitute).lower()
2647 +
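A short usage sketch for the two helpers above (the import path is an assumption; the expected outputs follow from the manual map and the NFKD decomposition):

    from utils.normalize import unormalize, lunormalize   # assumed import path

    print(unormalize(u'Éléphant'))          # -> u'Elephant'
    print(lunormalize(u'Œuvre complète'))   # -> u'oeuvre complete'
    # non-decomposable characters raise ValueError unless a substitute is given
    print(unormalize(u'中', substitute='?'))  # -> u'?'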
2648 +def simplify(sentence, lemmas=None, remove_stopwords=True, stopwords=FRENCH_STOPWORDS):
2649 +    """ Simplify the given sentence
2650 +        0) If remove_stopwords, then remove the stop words
2651 +        1) If lemmas are given, the sentence is lemmatized
2652 +        2) Set the sentence to lower case
2653 +        3) Remove punctuation
2654 +    """
2655 +    if not isinstance(sentence, basestring):
2656 +        return sentence
2657 +
2658 +    if lemmas:
2659 +        sentence = lemmatized(sentence, lemmas)
2660 +    sentence = sentence.lower()
2661 +    cleansent = ''.join([s if s not in punctuation
2662 +                           else ' ' for s in sentence]).strip()
2663 +    # a comma followed by a space was replaced by two spaces: keep only one
2664 +    cleansent = cleansent.replace('  ', ' ')
2665 +
2666 +    if not remove_stopwords:
2667 +        return cleansent
2668 +    else:
2669 +        return ' '.join([w for w in cleansent.split(' ') if w not in stopwords])
2670 +
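A quick sketch of ``simplify`` on a French sentence (import path assumed; stop words come from FRENCH_STOPWORDS above):

    from utils.normalize import simplify   # assumed import path

    # lowercased, punctuation replaced by spaces, stop words removed
    print(simplify(u'Le petit chat, et le chien !'))
    # -> u'petit chat chien'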
2671 +def tokenize(sentence, tokenizer=None, regexp=re.compile(r"[^\s]+")):
2672 +    """ Tokenize a sentence.
2673 +        Use ``tokenizer`` if given; it must provide a ``tokenize()`` method.
2674 +        Otherwise, split the sentence on ``regexp`` matches (i.e. on whitespace),
2675 +        keeping apostrophes attached to the preceding chunk
2676 +        (e.g. u"l'ami" gives u"l'" and u"ami").
2677 +    """
2678 +    if tokenizer:
2679 +        return tokenizer().tokenize(sentence)
2680 +    # XXX Unicode issue: could not use the nltk WordPunctTokenizer.
2681 +    # Instead split on whitespaces
2682 +    chunks = []
2683 +    for chunk in [t for t in regexp.findall(sentence) if t]:
2684 +        # Deals with '
2685 +        if "'" in chunk:
2686 +            schunks = chunk.split("'")
2687 +            chunks.extend([c+"'" for c in schunks[:-1]])
2688 +            chunks.append(schunks[-1])
2689 +        else:
2690 +            chunks.append(chunk)
2691 +    return chunks
2692 +
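A short example of the whitespace tokenization and apostrophe handling described above (import path assumed):

    from utils.normalize import tokenize   # assumed import path

    print(tokenize(u"l'ami de Victor Hugo"))
    # -> [u"l'", u'ami', u'de', u'Victor', u'Hugo']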
2693 +def iter_wordgrams(sentence, k):
2694 +    """ Generator of k-wordgrams on the given sentence
2695 +    """
2696 +    words = sentence.split(' ')
2697 +    #XXX Call tokenizer
2698 +    for r in xrange(len(words)):
2699 +        yield ' '.join(words[r:r + k])
2700 +
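For instance (import path assumed), 2-wordgrams over a four-word sentence; note that the loop runs to the last word, so the final gram is a single word:

    from utils.normalize import iter_wordgrams   # assumed import path

    print(list(iter_wordgrams(u'nimes nimes ville nimes', 2)))
    # -> [u'nimes nimes', u'nimes ville', u'ville nimes', u'nimes']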
2701 +def loadlemmas(filename, encoding='utf-8'):
2702 +    """ Return the lemmas dictionary loaded from ``filename``
2703 +        (one tab-separated word/lemma pair per line) """
2704 +    lemmas = {}
2705 +    with open(filename) as fobj:
2706 +        for line in fobj:
2707 +            line = line.decode(encoding).strip().split('\t')
2708 +            if len(line) == 2:
2709 +                lemmas[line[0]] = line[1]
2710 +    return lemmas
2711 +
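A hypothetical call to ``loadlemmas`` (the file name and its content are illustrative, not shipped with the package):

    from utils.normalize import loadlemmas   # assumed import path

    # 'french_lemmas.txt' would contain tab-separated lines such as:
    #   chevaux<TAB>cheval
    #   suis<TAB>être|suivre
    lemmas = loadlemmas('french_lemmas.txt')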
2712 +def lemmatized(sentence, lemmas, tokenizer=None):
2713 +    """ Return the lemmatized sentence
2714 +    """
2715 +    tokenized_sent = tokenize(sentence, tokenizer)
2716 +    tokenized_sentformated = []
2717 +    for w in tokenized_sent:
2718 +        if w in ".,'" and len(tokenized_sentformated) > 0:
2719 +            tokenized_sentformated[-1] += w
2720 +        elif w not in punctuation:
2721 +            tokenized_sentformated.append(w)
2722 +    return u' '.join([lemmatized_word(w, lemmas) for w in tokenized_sentformated])
2723 +
2724 +def lemmatized_word(word, lemmas):
2725 +    """ Return the lemmatized word
2726 +    """
2727 +    lemma = lemmas.get(word.lower(), word)
2728 +    if '|' in lemma:
2729 +        _words = lemma.split('|')
2730 +        if word.lower() in _words:
2731 +            lemma = word.lower()
2732 +        else:
2733 +            lemma = _words[0]
2734 +    return lemma
2735 +
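A small sketch of the lemmatization helpers, including the '|'-separated alternatives handled by ``lemmatized_word`` (import path assumed; the lemmas dict is illustrative):

    from utils.normalize import lemmatized, lemmatized_word   # assumed import path

    lemmas = {u'chevaux': u'cheval', u'est': u'être|est'}
    print(lemmatized_word(u'chevaux', lemmas))   # -> u'cheval'
    print(lemmatized_word(u'est', lemmas))       # -> u'est' (kept: it is one of the alternatives)
    print(lemmatized(u'les chevaux', lemmas))    # -> u'les cheval' (unknown words kept as-is)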
2736 +def roundstr(number, ndigits=0):
2737 +    """Return a string of ``number`` rounded to a given precision
2738 +        in decimal digits (default 0 digits)
2739 +
2740 +        If ``number`` is not a float, this function casts it to a float
2741 +        (an exception may be raised if this is not possible).
2742 +    """
2743 +    return format(round(float(number), ndigits), '0.%df' % ndigits)
2744 +
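For example (import path assumed):

    from utils.normalize import roundstr   # assumed import path

    print(roundstr('3.1459', 2))   # -> '3.15'
    print(roundstr(3.1459))        # -> '3'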
2745 +def rgxformat(string, regexp, output):
2746 +    """ Apply the regexp to the ``string`` and return a formatted string
2747 +    according to ``output``
2748 +
2749 +    e.g.:
2750 +        rgxformat(u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]',
2751 +                  r'\[(?P<firstname>\w+) (?P<lastname>\w+) - '
2752 +                  r'(?P<birthdate>.*?) / (?P<deathdate>.*?)\]',
2753 +                  u'%(lastname)s, %(firstname)s (%(birthdate)s - '
2754 +                  u'%(deathdate)s)')
2755 +
2756 +    would return u'Hugo, Victor (26 fev 1802 - 22 mai 1885)'
2757 +    """
2758 +
2759 +    match = re.match(regexp, string)
2760 +    return output % match.groupdict()
2761 +
2762 +
2763 +###############################################################################
2764 +### NORMALIZER OBJECTS ########################################################
2765 +###############################################################################
2766 +class BaseNormalizer(object):
2767 +    """ A normalizer object used to provide an abstraction over the different
2768 +    normalization functions, and help build Nazca processes. """
2769 +
2770 +    def __init__(self, callback, attr_index=None):
2771 +        """ Initialize the BaseNormalizer
2772 +
2773 +        Parameters
2774 +        ----------
2775 +        callback: normalization callback
2776 +
2777 +        attr_index: index of the attribute of interest in a record
2778 +                    (i.e. attribute to be normalized).
2779 +                    By default, 'attr_index' is None and the whole
2780 +                    record is passed to the callback.
2781 +                    If given, only the attr_index value(s) of the record
2782 +                    are passed to the callback.
2783 +                    Can be an int or a list/tuple of ints.
2784 +        """
2785 +        self.callback = callback
2786 +        if attr_index is not None:
2787 +            self.attr_index = attr_index if isinstance(attr_index, (tuple, list)) else (attr_index,)
2788 +        else:
2789 +            self.attr_index = attr_index
2790 +
2791 +    def normalize(self, record):
2792 +        """ Normalize a record
2793 +
2794 +        Parameters
2795 +        ----------
2796 +        record: a record (tuple/list of values).
2797 +
2798 +        Returns
2799 +        -------
2800 +
2801 +        record: the normalized record.
2802 +        """
2803 +        if not self.attr_index:
2804 +            return self.callback(record)
2805 +        else:
2806 +            for attr_ind in self.attr_index:
2807 +                record = list(r if ind != attr_ind else self.callback(r)
2808 +                               for ind, r in enumerate(record))
2809 +            return record
2810 +
2811 +    def normalize_dataset(self, dataset, inplace=False):
2812 +        """ Normalize a dataset
2813 +
2814 +        Parameters
2815 +        ----------
2816 +        dataset: a list of records (tuples/lists of values).
2817 +
2818 +        inplace: Boolean. If True, normalize the dataset in place.
2819 +
2820 +        Returns
2821 +        -------
2822 +
2823 +        dataset: the normalized dataset.
2824 +        """
2825 +        if not inplace:
2826 +            dataset = [self.normalize(record) for record in dataset]
2827 +        else:
2828 +            # Change dataset in place
2829 +            for ind, record in enumerate(dataset):
2830 +                dataset[ind] = self.normalize(record)
2831 +        return dataset
2832 +
2833 +
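A short usage sketch of ``BaseNormalizer`` with an ``attr_index`` (import path assumed; the record content is illustrative):

    from utils.normalize import BaseNormalizer, lunormalize   # assumed import path

    # normalize only the second attribute of each record
    normalizer = BaseNormalizer(lunormalize, attr_index=1)
    print(normalizer.normalize((1, u'Toto Tàta', u'4012')))
    # -> [1, u'toto tata', u'4012']
    print(normalizer.normalize_dataset([(1, u'Toto Tàta', u'4012')]))
    # -> [[1, u'toto tata', u'4012']]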
2834 +class UnicodeNormalizer(BaseNormalizer):
2835 +    """ Normalizer that unormalizes unicode strings
2836 +    (i.e. replaces accented characters with their ASCII equivalents)
2837 +    """
2838 +    def __init__(self, attr_index=None, substitute=None):
2839 +        callback = partial(lunormalize, substitute=substitute)
2840 +        super(UnicodeNormalizer, self).__init__(callback, attr_index=attr_index)
2841 +
2842 +
2843 +class SimplifyNormalizer(BaseNormalizer):
2844 +    """ Normalizer that simplifies a string
2845 +        0) If remove_stopwords, then remove the stop words
2846 +        1) If lemmas are given, the sentence is lemmatized
2847 +        2) Set the sentence to lower case
2848 +        3) Remove punctuation
2849 +    """
2850 +    def __init__(self, attr_index=None, lemmas=None, remove_stopwords=True):
2851 +        callback = partial(simplify, lemmas=lemmas, remove_stopwords=remove_stopwords)
2852 +        super(SimplifyNormalizer, self).__init__(callback, attr_index=attr_index)
2853 +
2854 +
2855 +class TokenizerNormalizer(BaseNormalizer):
2856 +    """ Normalizer that tokenizes a string.
2857 +        Use ``tokenizer`` if given (it must provide a ``tokenize()`` method);
2858 +        otherwise, split the string on ``regexp`` matches (i.e. on whitespace),
2859 +        keeping apostrophes attached to the preceding chunk.
2860 +    """
2861 +    def __init__(self, attr_index=None, tokenizer=None, regexp=re.compile(r"[^\s]+")):
2862 +        callback = partial(tokenize, tokenizer=tokenizer, regexp=regexp)
2863 +        super(TokenizerNormalizer, self).__init__(callback, attr_index=attr_index)
2864 +
2865 +
2866 +class LemmatizerNormalizer(BaseNormalizer):
2867 +    """ Normalizer that lemmatizes a string
2868 +    """
2869 +    def __init__(self, lemmas, attr_index=None, tokenizer=None):
2870 +        callback = partial(lemmatized, lemmas=lemmas, tokenizer=tokenizer)
2871 +        super(LemmatizerNormalizer, self).__init__(callback, attr_index=attr_index)
2872 +
2873 +
2874 +class RoundNormalizer(BaseNormalizer):
2875 +    """Normalizer that rounds a number.
2876 +    Return a string of ``number`` rounded to a given precision
2877 +    in decimal digits (default 0 digits)
2878 +
2879 +    If ``number`` is not a float, it is cast to a float
2880 +    (an exception may be raised if this is not possible).
2881 +    """
2882 +    def __init__(self, attr_index=None, ndigits=0):
2883 +        callback = partial(roundstr, ndigits=ndigits)
2884 +        super(RoundNormalizer, self).__init__(callback, attr_index=attr_index)
2885 +
2886 +
2887 +class RegexpNormalizer(BaseNormalizer):
2888 +    """Normalizer that normalizes a string based on a regexp
2889 +
2890 +    Apply the regexp to the ``string`` and return a formatted string
2891 +    according to ``output``
2892 +
2893 +    e.g.:
2894 +        rgxformat(u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]',
2895 +                  r'\[(?P<firstname>\w+) (?P<lastname>\w+) - '
2896 +                  r'(?P<birthdate>.*?) / (?P<deathdate>.*?)\]',
2897 +                  u'%(lastname)s, %(firstname)s (%(birthdate)s - '
2898 +                  u'%(deathdate)s)')
2899 +
2900 +    would return u'Hugo, Victor (26 fev 1802 - 22 mai 1885)'
2901 +    """
2902 +    def __init__(self, regexp, output, attr_index=None):
2903 +        callback = partial(rgxformat, regexp=regexp, output=output)
2904 +        super(RegexpNormalizer, self).__init__(callback, attr_index=attr_index)
2905 +
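The subclasses above are thin wrappers that bind one of the normalization functions into the ``BaseNormalizer`` machinery; a couple of hedged examples (import path assumed):

    from utils.normalize import UnicodeNormalizer, RoundNormalizer   # assumed import path

    print(UnicodeNormalizer(attr_index=1).normalize((1, u'Médiathèque')))
    # -> [1, u'mediatheque']
    print(RoundNormalizer(attr_index=1, ndigits=2).normalize((1, '3.14159')))
    # -> [1, '3.14']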
2906 +
2907 +###############################################################################
2908 +### JOIN NORMALIZER ###########################################################
2909 +###############################################################################
2910 +class JoinNormalizer(BaseNormalizer):
2911 +    """Normalizer that joins multiple fields into a single one.
2912 +    The new field is appended at the end of the resulting record.
2913 +    """
2914 +    def __init__(self, attr_indexes, join_car=', '):
2915 +        self.attr_indexes = attr_indexes
2916 +        self.join_car = join_car
2917 +
2918 +    def normalize(self, record):
2919 +        """ Normalize a record
2920 +
2921 +        Parameters
2922 +        ----------
2923 +        record: a record (tuple/list of values).
2924 +
2925 +        Returns
2926 +        -------
2927 +
2928 +        record: the normalized record.
2929 +        """
2930 +        _record = [r for ind, r in enumerate(record) if ind not in self.attr_indexes]
2931 +        _record.append(self.join_car.join([r for ind, r in enumerate(record) if ind in self.attr_indexes]))
2932 +        return _record
2933 +
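For instance (import path assumed), joining the two name fields of a record:

    from utils.normalize import JoinNormalizer   # assumed import path

    print(JoinNormalizer((1, 2)).normalize((1, u'Victor', u'Hugo', u'Besancon')))
    # -> [1, u'Besancon', u'Victor, Hugo']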
2934 +
2935 +###############################################################################
2936 +### NORMALIZER PIPELINE #######################################################
2937 +###############################################################################
2938 +class NormalizerPipeline(BaseNormalizer):
2939 +    """ Pipeline of Normalizers
2940 +    """
2941 +
2942 +    def __init__(self, normalizers):
2943 +        """ Initialize the NormalizerPipeline
2944 +
2945 +        Parameters
2946 +        ----------
2947 +        normalizers: ordered list of normalizers to apply.
2948 +        """
2949 +        self.normalizers = normalizers
2950 +
2951 +    def normalize(self, record):
2952 +        """ Normalize a record
2953 +
2954 +        Parameters
2955 +        ----------
2956 +        record: a record (tuple/list of values).
2957 +
2958 +        Returns
2959 +        -------
2960 +
2961 +        record: the normalized record.
2962 +        """
2963 +        for normalizer in self.normalizers:
2964 +            record = normalizer.normalize(record)
2965 +        return record
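A minimal pipeline sketch chaining two of the normalizers defined above (import path assumed; each normalizer is applied in order to the same record):

    from utils.normalize import (NormalizerPipeline, UnicodeNormalizer,
                                 SimplifyNormalizer)   # assumed import path

    pipeline = NormalizerPipeline([UnicodeNormalizer(attr_index=1),
                                   SimplifyNormalizer(attr_index=1)])
    print(pipeline.normalize((1, u'Le Petit Chat était là !')))
    # -> [1, u'petit chat etait']  (accents, punctuation and stop words removed)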