[old api] Remove deprecated old API, closes #197016

author           vincent.michel@logilab.fr
changeset        e9b7a47e8d3e
branch           default
phase            public
hidden           no
parent revision  #8978092150e7 [normalize] Remove deprecated "ignorenonascii" in unormalize, closes #187456
child revision   #61a56bf04d36 [ner] Remove unused files and move tests, related to #187461
files modified by this revision
record_linkage/old_api.py
test/test_old_api.py
# HG changeset patch
# User vincent.michel@logilab.fr
# Date 1387464107 0
# Thu Dec 19 14:41:47 2013 +0000
# Node ID e9b7a47e8d3e0333c193f24c291f1153b5b59003
# Parent 8978092150e79613bea332a38dc302d39ecdc1c9
[old api] Remove deprecated old API, closes #197016

diff --git a/record_linkage/old_api.py b/record_linkage/old_api.py
@@ -1,432 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-from os import listdir
-import os.path as osp
-from shutil import rmtree
-from tempfile import mkdtemp
-import sys
-import warnings
-from functools import partial
-
-from scipy.sparse import lil_matrix
-
-from nazca.dataio import write_results, split_file, parsefile
-from nazca.normalize import BaseNormalizer, NormalizerPipeline
-from nazca.distances import GeographicalProcessing
-from nazca.record_linkage.aligner import BaseAligner
-from nazca.record_linkage.blocking import KmeansBlocking, KdTreeBlocking, MinHashingBlocking
-
-
-# Backward compatibility. Now, use the BaseAligner inside the functions.
-# Perhaps these functions may be removed later...
-
-
-###############################################################################
-### NORMALIZE FUNCTIONS #######################################################
-###############################################################################
-# Backward compatibility. Now, use the NormalizerPipeline inside the functions.
-# Perhaps these functions may be removed later...
-
-def normalize_set(rset, processings):
-    """ Apply all the normalization functions to the given rset """
-    warnings.warn(DeprecationWarning('This function will be removed '
-                                     'in the next release.'
-                                     'You should rather use the BaseNormalizer '
-                                     'object of the normalize module'))
-    normalizers = []
-    for ind, processing in processings.iteritems():
-        for normalizer in extract_normalization_from_treatment(processing, ind):
-            normalizers.append(normalizer)
-    # Create pipeline
-    pipeline = NormalizerPipeline(normalizers)
-    return pipeline.normalize_dataset(rset)
-
-def extract_normalization_from_treatment(processing, ind):
-    """ Extract normalization from processing.
-    This function is used for backward compatibility with
-    the old function-based API """
-    warnings.warn(DeprecationWarning('This function will be removed '
-                                     'in the next release.'
-                                     'You should rather use the BaseNormalizer '
-                                     'object of the normalize module'))
-    for f in processing.get('normalization', []):
-        farg = f.func_code.co_varnames #List of the arguments of f
-        # A kind of union between the arguments needed by f, and the
-        # provided ones
-        givenargs = dict((arg, processing['norm_params'][arg])
-                         for arg in farg if arg in processing.get('norm_params', []))
-        callback = f
-        if givenargs:
-            callback = partial(callback, **givenargs)
-        yield BaseNormalizer(callback=callback, attr_index=ind)
-
-def extract_treatment_from_treatment(processing, ind):
-    """ Extract Treatment object from processing dict.
-    This is only for backward compatibility with the old API.
-    """
-    if processing['metric'] == 'geographical':
-        return GeographicalProcessing(ind, ind,
-                                     matrix_normalized=processing.get('matrix_normalized', False),
-                                     **processing.get('metric_params', {}))
-
-
-###############################################################################
-### ALIGNER ###################################################################
-###############################################################################
-def align(alignset, targetset, threshold, processings=None, resultfile=None,
-          _applyNormalization=True):
-    """ Try to align the items of alignset onto targetset's ones
-
-        `alignset` and `targetset` are the sets to align. Each set contains
-        lists where the first column is the identifier of the item,
-        and the others are
-        the attributs to align. (Note that the order is important !) Both must
-        have the same number of columns.
-
-        `processings` is a dictionary of dictionaries.
-        Each key is the indice of the row, and each value is a dictionary
-        that contains the processings to do on the different attributs.
-        Each dictionary is built as the following:
-
-            processing = {'normalization': [f1, f2, f3],
-                         'norm_params': {'arg1': arg01, 'arg2': arg02},
-                         'metric': d1,
-                         'metric_params': {'arg1': arg11},
-                         'weighting': w,
-                         'matrix_normalize': True
-                        }
-
-            `normalization` is the list of functions called to normalize the
-            given attribut (in order). Each functions is called with `norm_params`
-            as arguments
-
-            Idem for `distance` and `distance_args`
-
-            `weighting` is the weighting for the current attribut in regard to
-            the others
-
-            `resultfile` (default is None). Write the matched elements in a file.
-
-        Return the distance matrix and the matched list.
-    """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the BaseAligner '
-                                     'object of the aligner module'))
-    processings = processings or {}
-    # Get the normalizers
-    normalizers = []
-    for ind, processing in processings.iteritems():
-        for normalizer in extract_normalization_from_treatment(processing, ind):
-            normalizers.append(normalizer)
-    # Cleanup processings
-    for t in processings.itervalues():
-        if 'normalization' in t:
-            t.pop('normalization')
-        if 'norm_params' in t:
-            t.pop('norm_params')
-    # Build aligner
-    processings = [extract_treatment_from_treatment(t, ind) for ind, t in processings.iteritems()]
-    aligner = BaseAligner(threshold, processings)
-    aligner.register_ref_normalizer(normalizers)
-    aligner.register_target_normalizer(normalizers)
-    # Align
-    return aligner.align(alignset, targetset)
-
-def subalign(alignset, targetset, alignind, targetind, threshold,
-             processings=None, _applyNormalization=True):
-    """ Compute a subalignment for a list of indices of the alignset and
-    a list of indices for the targetset """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the BaseAligner '
-                                     'object of the aligner module'))
-    mat, matched = align([alignset[i[0]] for i in alignind],
-                         [targetset[i[0]] for i in targetind], threshold,
-                         processings, _applyNormalization=_applyNormalization)
-    new_matched = {}
-    for k, values in matched.iteritems():
-        new_matched[alignind[k]] = [(targetind[i], d) for i, d in values]
-    return mat, new_matched
-
-def conquer_and_divide_alignment(alignset, targetset, threshold, processings=None,
-                                 indexes=(1,1), mode='kdtree', neighbours_threshold=0.1,
-                                 n_clusters=None, kwordsgram=1, siglen=200,
-                                 get_global_mat=True):
-    """ Full conquer and divide method for alignment.
-    Compute neighbours and merge the different subalignments.
-    """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the BaseAligner '
-                                     'object of the aligner module'))
-    global_matched = {}
-    if get_global_mat:
-        global_mat = lil_matrix((len(alignset), len(targetset)))
-
-    processings = processings or {}
-    ralignset = normalize_set(alignset, processings)
-    rtargetset = normalize_set(targetset, processings)
-
-    for alignind, targetind in findneighbours(ralignset, rtargetset, indexes, mode,
-                                              neighbours_threshold, n_clusters,
-                                              kwordsgram, siglen):
-        _, matched = subalign(alignset, targetset, alignind, targetind,
-                                threshold, processings, _applyNormalization=False)
-        for k, values in matched.iteritems():
-            subdict = global_matched.setdefault(k, set())
-            for v, d in values:
-                subdict.add((v, d))
-                # XXX avoid issue in sparse matrix
-                if get_global_mat:
-                    global_mat[k[0], v[0]] = d or 10**(-10)
-    if get_global_mat:
-        return global_mat, global_matched
-    return global_matched
-
-def alignall(alignset, targetset, threshold, processings=None,
-             indexes=(1,1), mode='kdtree', neighbours_threshold=0.1,
-             n_clusters=None, kwordsgram=1, siglen=200, uniq=False):
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the BaseAligner '
-                                     'object of the aligner module'))
-    if not mode:
-        _, matched = align(alignset, targetset, threshold, processings,
-                           resultfile=None, _applyNormalization=True)
-    else:
-        matched = conquer_and_divide_alignment(alignset, targetset, threshold,
-                                               processings, indexes, mode,
-                                               neighbours_threshold, n_clusters,
-                                               kwordsgram, siglen,
-                                               get_global_mat=False)
-
-    if not uniq:
-        for alignid in matched:
-            for targetid, _ in matched[alignid]:
-                yield alignset[alignid[0]][0], targetset[targetid[0]][0]
-    else:
-        for alignid in matched:
-            bestid, _ = sorted(matched[alignid], key=lambda x:x[1])[0]
-            yield alignset[alignid[0]][0], targetset[bestid[0]][0]
-
-def alignall_iterative(alignfile, targetfile, alignformat, targetformat,
-                       threshold, size=10000, equality_threshold=0.01,
-                       processings=None, indexes=(1,1), mode='kdtree',
-                       neighbours_threshold=0.1, n_clusters=None, kwordsgram=1,
-                       siglen=200, cache=None):
-    """ This function helps you to align *huge* files.
-        It takes your csv files as arguments and split them into smaller ones
-        (files of `size` lines), and runs the alignment on those files.
-
-        `alignformat` and `targetformat` are keyworded arguments given to the
-        nazca.dataio.parsefile function.
-
-        This function returns its own cache. The cache is quite simply a
-        dictionary having align items' id as keys and tuples (target item's id,
-        distance) as value. This dictionary can be regiven to this function to
-        perform another alignment (with different parameters, or just to be
-        sure everything has been caught)
-
-        If the distance of an alignment is below `equality_threshold`, the
-        alignment is considered as perfect, and the corresponding item is
-        removed from the alignset (to speed up the computation).
-    """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the BaseAligner '
-                                     'object of the aligner module'))
-    #Split the huge files into smaller ones
-    aligndir = mkdtemp()
-    targetdir = mkdtemp()
-    alignfiles = split_file(alignfile, aligndir, size)
-    targetfiles = split_file(targetfile, targetdir, size)
-
-    #Compute the number of iterations that must be done to achieve the alignement
-    nb_iterations = len(alignfiles) * len(targetfiles)
-    current_it = 0
-
-    cache = cache or {} #Contains the better known alignements
-    #Contains the id of perfectly aligned data
-    doneids = set(_id for _id, (_, dist) in cache.iteritems()
-                          if dist < equality_threshold)
-
-    try:
-        for alignfile in alignfiles:
-            alignset = [a for a in parsefile(osp.join(aligndir, alignfile), **alignformat)
-                        if a[0] not in doneids]
-            for targetfile in targetfiles:
-                targetset = parsefile(osp.join(targetdir, targetfile), **targetformat)
-                matched = conquer_and_divide_alignment(alignset, targetset,
-                                                       threshold,
-                                                       processings=processings,
-                                                       indexes=indexes,
-                                                       mode=mode,
-                                                       neighbours_threshold=neighbours_threshold,
-                                                       n_clusters=n_clusters,
-                                                       kwordsgram=kwordsgram,
-                                                       siglen=siglen,
-                                                       get_global_mat=False)
-                for alignid in matched:
-                    bestid, dist = sorted(matched[alignid], key=lambda x:x[1])[0]
-                    #Get the better known distance
-                    _, current_dist = cache.get(alignset[alignid[0]][0], (None, None))
-                    if current_dist is None or current_dist > dist:
-                        #If it's better, update the cache
-                        cache[alignset[alignid[0]][0]] = (targetset[bestid[0]][0], dist)
-                        if dist <= equality_threshold:
-                            #If perfect, stop trying to align this one
-                            doneids.add(alignset[alignid][0])
-
-                current_it += 1
-                sys.stdout.write('\r%0.2f%%' % (current_it * 100. /
-                                                nb_iterations))
-                sys.stdout.flush()
-                if doneids:
-                    alignset = [a for a in alignset if a[0] not in doneids]
-                if not alignset: #All items have been aligned
-                    #TODO Increment current_it.
-                    #The progress of the alignment process is computed with
-                    #`current_it`. If all items of `alignset` are aligned, we
-                    #stop the alignment process for this `alignset`. If
-                    #`current_it` isn’t incremented, the progress shown will be
-                    #false.
-                    break
-
-    finally:
-        rmtree(aligndir)
-        rmtree(targetdir)
-
-    return cache
-
-
-
-
-
-
-
-###############################################################################
-### CLUSTERING-BASED BLOCKINGS FUNCTIONS ######################################
-###############################################################################
-# Backward compatibility. Now, use the BlockingObject inside the functions.
-# Perhaps these functions may be removed later...
-def findneighbours_clustering(alignset, targetset, indexes=(1, 1),
-                              mode='kmeans', n_clusters=None):
-    """ Find the neigbhours using clustering (kmeans or minibatchkmeans)
-    """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the KmeansBlocking '
-                                     'object of the blocking module'))
-    if mode == 'kmeans':
-        blocking = KmeansBlocking(ref_attr_index=indexes[0],
-                                  target_attr_index=indexes[1],
-                                  n_clusters=n_clusters)
-    elif mode == 'minibatch':
-        blocking = MiniBatchKmeansBlocking(ref_attr_index=indexes[0],
-                                           target_attr_index=indexes[1],
-                                           n_clusters=n_clusters)
-    else:
-        raise ValueError("Mode should be 'kmeans' or 'minibatch'")
-    # Fit blocking object
-    blocking.fit(alignset, targetset)
-    return list(blocking.iter_blocks())
-
-def findneighbours_kdtree(alignset, targetset, indexes=(1, 1), threshold=0.1):
-    """ Find the neigbhours using kdree
-    """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the KdTreeBlocking '
-                                     'object of the blocking module'))
-    blocking = KdTreeBlocking(ref_attr_index=indexes[0],
-                              target_attr_index=indexes[1],
-                              threshold=threshold)
-    blocking.fit(alignset, targetset)
-    return list(blocking.iter_blocks())
-
-def findneighbours_minhashing(alignset, targetset, indexes=(1, 1), threshold=0.1,
-                              kwordsgram=1, siglen=200):
-    """ Find the neigbhours using minhashing
-    """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the '
-                                     'MinHashingBlocking '
-                                     'object of the blocking module'))
-    blocking = MinHashingBlocking(ref_attr_index=indexes[0],
-                                  target_attr_index=indexes[1],
-                                  threshold=threshold, kwordsgram=kwordsgram,
-                                  siglen=siglen)
-    blocking.fit(alignset, targetset)
-    return list(blocking.iter_blocks())
-
-def findneighbours(alignset, targetset, indexes=(1, 1), mode='kdtree',
-                   neighbours_threshold=0.1, n_clusters=None, kwordsgram=1, siglen=200):
-    """ This function helps to find neighbours from items of alignset and
-        targetset. “Neighbours” are items that are “not so far”, ie having a
-        close label, are located in the same area etc.
-
-        This function handles two types of neighbouring : text and numeric.
-        For text value, you have to use the “minhashing” and for numeric, you
-        can choose from “kdtree“, “kdmeans“ and “minibatch”
-
-        The arguments to give are :
-            - `alignset` and `targetset` are the sets where neighbours have to
-              be found.
-            - `indexes` are the location of items to compare
-            - `mode` is the search type to use
-            - `neighbours_threshold` is the `mode` neighbours_threshold
-
-            - `n_clusters` is used for "kmeans" and "minibatch" methods, and it
-              is the number of clusters to use.
-
-            - `kwordsgram` and `siglen` are used for "minhashing". `kwordsgram`
-              is the length of wordsgrams to use, and `siglen` is the length of
-              the minhashing signature matrix.
-
-        return a list of lists, built as the following :
-            [
-                [[indexes_of_alignset_0], [indexes_of_targetset_0]],
-                [[indexes_of_alignset_1], [indexes_of_targetset_1]],
-                [[indexes_of_alignset_2], [indexes_of_targetset_2]],
-                [[indexes_of_alignset_3], [indexes_of_targetset_3]],
-                ...
-            ]
-    """
-    warnings.warn(DeprecationWarning('This function will be removed in the next '
-                                     'release.'
-                                     ' You should rather use the '
-                                     'BaseBlocking '
-                                     'objects of the blocking module'))
-    SEARCHERS = set(['kdtree', 'minhashing', 'kmeans', 'minibatch'])
-    mode = mode.lower()
-
-    if mode not in SEARCHERS:
-        raise NotImplementedError('Unknown mode given')
-    if mode == 'kdtree':
-        return findneighbours_kdtree(alignset, targetset, indexes, neighbours_threshold)
-    elif mode == 'minhashing':
-        return findneighbours_minhashing(alignset, targetset, indexes, neighbours_threshold,
-                                         kwordsgram, siglen)
-    elif mode in set(['kmeans', 'minibatch']):
-        try:
-            return findneighbours_clustering(alignset, targetset, indexes, mode, n_clusters)
-        except:
-            raise NotImplementedError('Scikit learn does not seem to be installed')
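
For anyone migrating off the removed module, the deprecation warnings above all point to the class-based API. Below is a minimal sketch of a BaseAligner/BaseNormalizer equivalent of the removed align() call; it sticks to calls that appear verbatim in old_api.py itself (Python 2, like the module), and the sample records are illustrative only:

    # Class-based equivalent of the removed align() (sketch).
    from nazca.normalize import BaseNormalizer, simplify
    from nazca.distances import GeographicalProcessing
    from nazca.record_linkage.aligner import BaseAligner

    alignset = [['V1', 'label1', (6.2, 49)],
                ['V2', 'label2', (5.1, 48)]]
    targetset = [['T1', 'labelt1', (6.25, 48.91)]]

    # Old API: processings = {1: {'normalization': [simplify]}}
    normalizers = [BaseNormalizer(callback=simplify, attr_index=1)]
    # Old API: {2: {'metric': 'geographical',
    #               'metric_params': {'units': 'km', 'in_radians': False}}}
    processing = GeographicalProcessing(2, 2, units='km', in_radians=False)

    aligner = BaseAligner(30, [processing])  # threshold, list of processings
    aligner.register_ref_normalizer(normalizers)
    aligner.register_target_normalizer(normalizers)
    mat, matched = aligner.align(alignset, targetset)  # matrix + matched dict
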
diff --git a/test/test_old_api.py b/test/test_old_api.py
@@ -1,261 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-import unittest2
-import random
-random.seed(6) ### Make sure tests are repeatable
-from os import path
-
-from nazca.normalize import loadlemmas, simplify
-from nazca.record_linkage.old_api import (normalize_set,
-                                          findneighbours_clustering,
-                                          findneighbours_kdtree,
-                                          findneighbours_minhashing,
-                                          align, subalign,
-                                          conquer_and_divide_alignment,
-                                          alignall, alignall_iterative)
-
-
-TESTDIR = path.dirname(__file__)
-
-
-# Backward compatibility. Now, use the BaseAligner inside the functions.
-# Perhaps these functions may be removed later...
-
-
-###############################################################################
-### NORMALIZE TESTS ###########################################################
-###############################################################################
-class NormalizerFunctionTestCase(unittest2.TestCase):
-    def test_normalize_set(self):
-        processings = {1: {'normalization': [simplify,]}}
-
-        alignlist = [['Label1', u"Un nuage flotta dans le grand ciel bleu."],
-                     ['Label2', u"Pour quelle occasion vous êtes-vous apprêtée ?"],
-                     ['Label3', u"Je les vis ensemble à plusieurs occasions."],
-                     ['Label4', u"Je n'aime pas ce genre de bandes dessinées tristes."],
-                     ['Label5', u"Ensemble et à plusieurs occasions, je les vis."],
-                    ]
-        aligntuple = [tuple(l) for l in alignlist]
-
-        normalizedlist = normalize_set(alignlist, processings)
-        normalizedtuple = normalize_set(aligntuple, processings)
-
-        self.assertListEqual(normalizedlist, normalizedtuple)
-        self.assertListEqual(normalizedlist,
-                        [['Label1', u"nuage flotta grand ciel bleu"],
-                         ['Label2', u"occasion êtes apprêtée"],
-                         ['Label3', u"vis ensemble à plusieurs occasions"],
-                         ['Label4', u"n aime genre bandes dessinées tristes"],
-                         ['Label5', u"ensemble à plusieurs occasions vis"],
-                        ])
-
-
-###############################################################################
-### ALIGNER TESTS #############################################################
-###############################################################################
-class AlignerFunctionsTestCase(unittest2.TestCase):
-
-    def test_align(self):
-        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
-                    ['V2', 'label2', (6.2, 49)],
-                    ['V3', 'label3', (5.1, 48)],
-                    ['V4', 'label4', (5.2, 48.1)],
-                    ]
-        targetset = [['T1', 'labelt1', (6.17, 48.7)],
-                     ['T2', 'labelt2', (5.3, 48.2)],
-                     ['T3', 'labelt3', (6.25, 48.91)],
-                     ]
-        processings = {2: {'metric': 'geographical', 'matrix_normalized':False,
-                          'metric_params': {'units': 'km', 'in_radians': False}}}
-        mat, matched = align(alignset, targetset, 30, processings)
-        true_matched = [(0,0), (0, 2), (1,2), (3,1)]
-        for k, values in matched.iteritems():
-            for v, distance in values:
-                self.assertIn((k,v), true_matched)
-
-    def test_neighbours_align(self):
-        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
-                    ['V2', 'label2', (6.2, 49)],
-                    ['V3', 'label3', (5.1, 48)],
-                    ['V4', 'label4', (5.2, 48.1)],
-                    ]
-        targetset = [['T1', 'labelt1', (6.17, 48.7)],
-                     ['T2', 'labelt2', (5.3, 48.2)],
-                     ['T3', 'labelt3', (6.25, 48.91)],
-                     ]
-        true_matched = set([((0, 'V1'), (0, 'T1')),
-                           ((1, 'V2'), (2, 'T3')),
-                           ((0, 'V1'), (2, 'T3')),
-                           ((3, 'V4'), (1, 'T2'))])
-        neighbours = findneighbours_kdtree(alignset, targetset, indexes=(2, 2), threshold=0.3)
-        processings = {2: {'metric': 'geographical', 'matrix_normalized':False,
-                          'metric_params': {'units': 'km', 'in_radians': False}}}
-        predict_matched = set()
-        for alignind, targetind in neighbours:
-            mat, matched = subalign(alignset, targetset, alignind, targetind, 30, processings)
-            for k, values in matched.iteritems():
-                for v, distance in values:
-                    predict_matched.add((k, v))
-        self.assertEqual(true_matched, predict_matched)
-
-    def test_divide_and_conquer_align(self):
-        true_matched = set([((0, 'V1'), (0, 'T1')),
-                            ((1, 'V2'), (2, 'T3')),
-                            ((0, 'V1'), (2, 'T3')),
-                            ((3, 'V4'), (1, 'T2'))])
-        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
-                    ['V2', 'label2', (6.2, 49)],
-                    ['V3', 'label3', (5.1, 48)],
-                    ['V4', 'label4', (5.2, 48.1)],
-                    ]
-        targetset = [['T1', 'labelt1', (6.17, 48.7)],
-                     ['T2', 'labelt2', (5.3, 48.2)],
-                     ['T3', 'labelt3', (6.25, 48.91)],
-                     ]
-        processings = {2: {'metric': 'geographical', 'matrix_normalized':False,
-                          'metric_params': {'units': 'km', 'in_radians': False}}}
-        global_mat, global_matched = conquer_and_divide_alignment(alignset, targetset,
-                                                                  threshold=30,
-                                                                  processings=processings,
-                                                                  indexes=(2,2),
-                                                                  neighbours_threshold=0.3)
-        predict_matched = set()
-        for k, values in global_matched.iteritems():
-            for v, distance in values:
-                predict_matched.add((k, v))
-        self.assertEqual(true_matched, predict_matched)
-
-    def test_alignall(self):
-        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
-                    ['V2', 'label2', (6.2, 49)],
-                    ['V3', 'label3', (5.1, 48)],
-                    ['V4', 'label4', (5.2, 48.1)],
-                    ]
-        targetset = [['T1', 'labelt1', (6.17, 48.7)],
-                     ['T2', 'labelt2', (5.3, 48.2)],
-                     ['T3', 'labelt3', (6.25, 48.91)],
-                     ]
-        all_matched = set([('V1','T1'), ('V1', 'T3'), ('V2','T3'), ('V4','T2')])
-        uniq_matched = set([('V2', 'T3'), ('V4', 'T2'), ('V1', 'T1')])
-        processings = {2: {'metric': 'geographical', 'matrix_normalized': False,
-                          'metric_params': {'units': 'km', 'in_radians': False}}}
-
-        predict_uniq_matched = set(alignall(alignset, targetset,
-                                            threshold=30,
-                                            processings=processings,
-                                            indexes=(2,2),
-                                            neighbours_threshold=0.3,
-                                            uniq=True))
-        predict_matched = set(alignall(alignset, targetset,
-                                       threshold=30,
-                                       processings=processings,
-                                       indexes=(2,2),
-                                       neighbours_threshold=0.3,
-                                       uniq=False))
-
-        self.assertEqual(all_matched, predict_matched)
-        self.assertEqual(uniq_matched, predict_uniq_matched)
-
-    def test_alignall_iterative(self):
-        matched = set([('V2', 'T3'), ('V4', 'T2'), ('V1', 'T1')])
-        processings = {2: {'metric': 'geographical', 'matrix_normalized': False,
-                          'metric_params': {'units': 'km', 'in_radians': False}}}
-
-        _format={'indexes': [0, 1, (2, 3)]}
-        alignements = alignall_iterative(path.join(TESTDIR, 'data',
-                                                   'alignfile.csv'),
-                                         path.join(TESTDIR, 'data',
-                                                   'targetfile.csv'),
-                                         _format, _format, threshold=30,
-                                         size=2, #very small files ;)
-                                         processings=processings,
-                                         indexes=(2,2),
-                                         neighbours_threshold=0.3)
-        predict_matched = set([(a, t) for (a, (t, _)) in
-                               alignements.iteritems()])
-        self.assertEqual(matched, predict_matched)
-
-
-###############################################################################
-### NEIGHBOUR TESTS ###########################################################
-###############################################################################
-class NeigbhoursFunctionsTest(unittest2.TestCase):
-    # For backward compatibility
-
-    def test_findneighbours_kdtree(self):
-        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
-                    ['V2', 'label2', (6.2, 49)],
-                    ['V3', 'label3', (5.1, 48)],
-                    ['V4', 'label4', (5.2, 48.1)],
-                    ]
-        targetset = [['T1', 'labelt1', (6.2, 48.9)],
-                     ['T2', 'labelt2', (5.3, 48.2)],
-                     ['T3', 'labelt3', (6.25, 48.91)],
-                     ]
-        neighbours = findneighbours_kdtree(alignset, targetset, indexes=(2, 2), threshold=0.3)
-        self.assertEqual([([(0, 'V1')], [(0, 'T1'), (2, 'T3')]),
-                          ([(1, 'V2')], [(0, 'T1'), (2, 'T3')]),
-                          ([(2, 'V3')], [(1, 'T2')]),
-                          ([(3, 'V4')], [(1, 'T2')])], neighbours)
-
-    def test_findneighbours_minhashing(self):
-        lemmas = loadlemmas(path.join(TESTDIR, 'data', 'french_lemmas.txt'))
-        processings = {2: {'normalization': [simplify,], 'norm_params': {'lemmas': lemmas}}}
-        alignset = [['V1', 'label1', u"Un nuage flotta dans le grand ciel bleu."],
-                    ['V2', 'label2', u"Pour quelle occasion vous êtes-vous apprêtée ?"],
-                    ['V3', 'label3', u"Je les vis ensemble à plusieurs occasions."],
-                    ['V4', 'label4', u"Je n'aime pas ce genre de bandes dessinées tristes."],
-                    ['V5', 'label5', u"Ensemble et à plusieurs occasions, je les vis."],
-                    ]
-        targetset = [['T1', 'labelt1', u"Des grands nuages noirs flottent dans le ciel."],
-                     ['T2', 'labelt2', u"Je les ai vus ensemble à plusieurs occasions."],
-                     ['T3', 'labelt3', u"J'aime les bandes dessinées de genre comiques."],
-                     ]
-        alignset = normalize_set(alignset, processings)
-        targetset = normalize_set(targetset, processings)
-        neighbours = findneighbours_minhashing(alignset, targetset, indexes=(2, 2), threshold=0.4)
-        true_set = [([(0, 'V1')], [(0, 'T1')]), ([(3, 'V4')], [(2, 'T3')]),
-                    ([(2, 'V3'), (4, 'V5')], [(1, 'T2')])]
-        for align in true_set:
-            self.assertIn(align, neighbours)
-
-    def test_findneighbours_clustering(self):
-        alignset = [['V1', 'label1', (6.14194444444, 48.67)],
-                    ['V2', 'label2', (6.2, 49)],
-                    ['V3', 'label3', (5.1, 48)],
-                    ['V4', 'label4', (5.2, 48.1)],
-                    ]
-        targetset = [['T1', 'labelt1', (6.2, 48.9)],
-                     ['T2', 'labelt2', (5.3, 48.2)],
-                     ['T3', 'labelt3', (6.25, 48.91)],
-                     ]
-        try:
-            import sklearn as skl
-        except ImportError:
-            self.skipTest('Scikit learn does not seem to be installed')
-        if int(skl.__version__.split('-')[0].split('.')[1])<=11:
-            self.skipTest('Scikit learn version is too old - Skipping test')
-        neighbours = findneighbours_clustering(alignset, targetset, indexes=(2, 2))
-        for neighbour in neighbours:
-            self.assertIn(neighbour, [([0, 1], [0, 2]), ([2, 3], [1])])
-
-
-if __name__ == '__main__':
-    unittest2.main()
-
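
The findneighbours_* wrappers deleted above map one-to-one onto the blocking objects imported at the top of old_api.py. A minimal sketch for the kdtree case, again restricted to the constructor arguments the removed wrapper itself passed:

    # Blocking-object equivalent of findneighbours_kdtree() (sketch).
    from nazca.record_linkage.blocking import KdTreeBlocking

    alignset = [['V1', 'label1', (6.14, 48.67)],
                ['V2', 'label2', (6.2, 49)]]
    targetset = [['T1', 'labelt1', (6.17, 48.7)]]

    # Old API: findneighbours_kdtree(alignset, targetset, indexes=(2, 2),
    #                                threshold=0.3)
    blocking = KdTreeBlocking(ref_attr_index=2, target_attr_index=2,
                              threshold=0.3)
    blocking.fit(alignset, targetset)
    neighbours = list(blocking.iter_blocks())  # [(align inds, target inds), ...]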