[nazca] Create a record linkage directory, related to #187461

author: vincent.michel@logilab.fr
changeset: fbdea77e94dc
branch: default
phase: draft
hidden: yes
parent revision: #9f4f1b034cbf [test] Fix typo in test
child revision: #4a009442b0de [normalize] Remove deprecated "ignorennonascii" in unormalize, closes #187456
files modified by this revision
aligner.py
blocking.py
old_api.py
record_linkage/__init__.py
record_linkage/aligner.py
record_linkage/blocking.py
record_linkage/old_api.py
test/test_alignment.py
test/test_blocking.py
test/test_old_api.py
# HG changeset patch
# User vincent.michel@logilab.fr
# Date 1387464062 0
# Thu Dec 19 14:41:02 2013 +0000
# Node ID fbdea77e94dc08d6f22fa206fe4e0001072127aa
# Parent 9f4f1b034cbf26e08c3dbd378f2d9007b0d725be
[nazca] Create a record linkage directory, related to #187461

diff --git a/aligner.py b/aligner.py
@@ -1,324 +0,0 @@
1 -# -*- coding:utf-8 -*-
2 -# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
3 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
4 -#
5 -# This program is free software: you can redistribute it and/or modify it under
6 -# the terms of the GNU Lesser General Public License as published by the Free
7 -# Software Foundation, either version 2.1 of the License, or (at your option)
8 -# any later version.
9 -#
10 -# This program is distributed in the hope that it will be useful, but WITHOUT
11 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
13 -# details.
14 -#
15 -# You should have received a copy of the GNU Lesser General Public License along
16 -# with this program. If not, see <http://www.gnu.org/licenses/>.
17 -import time
18 -import logging
19 -from collections import defaultdict
20 -
21 -from scipy import zeros
22 -from scipy.sparse import lil_matrix
23 -
24 -from nazca.dataio import parsefile
25 -
26 -
27 -###############################################################################
28 -### UTILITY FUNCTIONS #########################################################
29 -###############################################################################
30 -def iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique=True):
31 -    """ Return the aligned pairs
32 -    """
33 -    if unique:
34 -        for refid in global_matched:
35 -            bestid, _ = sorted(global_matched[refid], key=lambda x:x[1])[0]
36 -            ref_record = refset[refid]
37 -            target_record = targetset[bestid]
38 -            distance = global_mat[refid, bestid] if global_mat is not None else None
39 -            yield (ref_record[0], refid), (target_record[0], bestid), distance
40 -    else:
41 -        for refid in global_matched:
42 -            for targetid, _ in global_matched[refid]:
43 -                ref_record = refset[refid]
44 -                target_record = targetset[targetid]
45 -                distance = global_mat[refid, targetid] if global_mat is not None else None
46 -                yield (ref_record[0], refid), (target_record[0], targetid), distance
47 -
48 -
49 -###############################################################################
50 -### BASE ALIGNER OBJECT #######################################################
51 -###############################################################################
52 -class BaseAligner(object):
53 -
54 -    def __init__(self, threshold, processings, normalize_matrix=False):
55 -        self.threshold = threshold
56 -        self.processings = processings
57 -        self.normalize_matrix = normalize_matrix
58 -        self.ref_normalizer = None
59 -        self.target_normalizer = None
61 -        self.blocking = None
62 -        self.alignments_done = 0
63 -        self.pairs_found = 0
64 -        self.nb_comparisons = 0
65 -        self.nb_blocks = 0
66 -        self.refset_size = None
67 -        self.targetset_size = None
68 -        self.time = None
69 -        self.logger = logging.getLogger('nazca.aligner')
70 -
71 -    def register_ref_normalizer(self, normalizer):
72 -        """ Register normalizers to be applied
73 -        before alignment """
74 -        self.ref_normalizer = normalizer
75 -
76 -    def register_target_normalizer(self, normalizer):
77 -        """ Register normalizers to be applied
78 -        before alignment """
79 -        self.target_normalizer = normalizer
80 -
81 -    def register_blocking(self, blocking):
82 -        self.blocking = blocking
83 -
84 -    def apply_normalization(self, dataset, normalizer):
85 -        if normalizer:
86 -            return normalizer.normalize_dataset(dataset)
87 -        return dataset
88 -
89 -    def compute_distance_matrix(self, refset, targetset,
90 -                                ref_indexes, target_indexes):
91 -        """ Compute and return the global alignment matrix.
92 -        For each `processing` a distance matrix is built, then all the
93 -        matrices are summed with their own weighting and the result is the global
94 -        alignment matrix, which is returned.
95 -        """
96 -        distmatrix = zeros((len(ref_indexes), len(target_indexes)), dtype='float32')
97 -        for processing in self.processings:
98 -            distmatrix += processing.cdist(refset, targetset,
99 -                                          ref_indexes, target_indexes)
100 -        return distmatrix
101 -
102 -    def threshold_matched(self, distmatrix):
103 -        """ Return the matched elements within a dictionnary,
104 -        each key being the indice from X, and the corresponding
105 -        values being a list of couple (indice from Y, distance)
106 -        """
107 -        match = defaultdict(list)
108 -        if self.normalize_matrix:
109 -            distmatrix /= distmatrix.max()
110 -        ind = (distmatrix <= self.threshold).nonzero()
111 -        indrow = ind[0].tolist()
112 -        indcol = ind[1].tolist()
113 -        for (i, j) in zip(indrow, indcol):
114 -            match[i].append((j, distmatrix[i, j]))
115 -        return match
116 -
117 -    def _get_match(self, refset, targetset, ref_indexes=None, target_indexes=None):
118 -        # Default to the full index ranges
120 -        ref_indexes = ref_indexes or xrange(len(refset))
121 -        target_indexes = target_indexes or xrange(len(targetset))
122 -        # Apply alignments
123 -        mat = self.compute_distance_matrix(refset, targetset,
124 -                                           ref_indexes=ref_indexes,
125 -                                           target_indexes=target_indexes)
126 -        matched = self.threshold_matched(mat)
127 -        # Reapply matched to global indexes
128 -        new_matched = {}
129 -        for k, values in matched.iteritems():
130 -            new_matched[ref_indexes[k]] = [(target_indexes[i], d) for i, d in values]
131 -        return mat, new_matched
132 -
133 -    def align(self, refset, targetset, get_matrix=True):
134 -        """ Perform the alignment on the referenceset
135 -        and the targetset
136 -        """
137 -        start_time = time.time()
138 -        refset = self.apply_normalization(refset, self.ref_normalizer)
139 -        targetset = self.apply_normalization(targetset, self.target_normalizer)
140 -        self.refset_size = len(refset)
141 -        self.targetset_size = len(targetset)
142 -        # If no blocking
143 -        if not self.blocking:
144 -            return self._get_match(refset, targetset)
145 -        # Blocking == conquer_and_divide
146 -        global_matched = {}
147 -        global_mat = lil_matrix((len(refset), len(targetset)))
148 -        self.blocking.fit(refset, targetset)
149 -        for refblock, targetblock in self.blocking.iter_blocks():
150 -            self.nb_blocks += 1
151 -            ref_index = [r[0] for r in refblock]
152 -            target_index = [r[0] for r in targetblock]
153 -            self.nb_comparisons += len(ref_index)*len(target_index)
154 -            _, matched = self._get_match(refset, targetset, ref_index, target_index)
155 -            for k, values in matched.iteritems():
156 -                subdict = global_matched.setdefault(k, set())
157 -                for v, d in values:
158 -                    subdict.add((v, d))
159 -                    self.alignments_done += 1
160 -                    if get_matrix:
161 -                        # XXX avoid issue in sparse matrix
162 -                        global_mat[k, v] = d or 10**(-10)
163 -        self.time = time.time() - start_time
164 -        return global_mat, global_matched
165 -
166 -    def get_aligned_pairs(self, refset, targetset, unique=True):
167 -        """ Get the pairs of aligned elements
168 -        """
169 -        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
170 -        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
171 -            self.pairs_found += 1
172 -            yield pair
173 -        self.log_infos()
174 -
175 -    def align_from_files(self, reffile, targetfile,
176 -                         ref_indexes=None, target_indexes=None,
177 -                         ref_encoding=None, target_encoding=None,
178 -                         ref_separator='\t', target_separator='\t',
179 -                         get_matrix=True):
180 -        """ Align data from files
181 -
182 -        Parameters
183 -        ----------
184 -
185 -        reffile: name of the reference file
186 -
187 -        targetfile: name of the target file
188 -
189 -        ref_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
190 -                      be used to read the files.
191 -
192 -        target_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
193 -                         be used to read the files.
194 -
195 -        ref_separator: separator of the reference file
196 -
197 -        target_separator: separator of the target file
198 -        """
199 -        refset = parsefile(reffile, indexes=ref_indexes,
200 -                           encoding=ref_encoding, delimiter=ref_separator)
201 -        targetset = parsefile(targetfile, indexes=target_indexes,
202 -                              encoding=target_encoding, delimiter=target_separator)
203 -        return self.align(refset, targetset, get_matrix=get_matrix)
204 -
205 -    def get_aligned_pairs_from_files(self, reffile, targetfile,
206 -                         ref_indexes=None, target_indexes=None,
207 -                         ref_encoding=None, target_encoding=None,
208 -                         ref_separator='\t', target_separator='\t',
209 -                         unique=True):
210 -        """ Get the pairs of aligned elements
211 -        """
212 -        refset = parsefile(reffile, indexes=ref_indexes,
213 -                           encoding=ref_encoding, delimiter=ref_separator)
214 -        targetset = parsefile(targetfile, indexes=target_indexes,
215 -                              encoding=target_encoding, delimiter=target_separator)
216 -        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
217 -        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
218 -            yield pair
219 -
220 -    def log_infos(self):
221 -        """ Display some info on the aligner process
222 -        """
223 -        self.logger.info('Computation time : %s' % self.time)
224 -        self.logger.info('Size reference set : %s' % self.refset_size)
225 -        self.logger.info('Size target set : %s' % self.targetset_size)
226 -        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
227 -        self.logger.info('Alignments done : %s' % self.alignments_done)
228 -        self.logger.info('Pairs found : %s' % self.pairs_found)
229 -        self.logger.info('Ratio reference set/alignments done : %s'
230 -                         % (self.alignments_done/float(self.refset_size)))
231 -        self.logger.info('Ratio target set/alignments done : %s'
232 -                         % (self.alignments_done/float(self.targetset_size)))
233 -        self.logger.info('Ratio reference set/pairs found : %s'
234 -                         % (self.pairs_found/float(self.refset_size)))
235 -        self.logger.info('Ratio target set/pairs found : %s'
236 -                         % (self.pairs_found/float(self.targetset_size)))
237 -        self.logger.info('Maximum comparisons : %s'
238 -                         % (self.refset_size * self.targetset_size))
239 -        self.logger.info('Number of blocks : %s' % self.nb_blocks)
240 -        if self.nb_blocks:
241 -            self.logger.info('Ratio comparisons/block : %s'
242 -                             % (float(self.nb_comparisons)/self.nb_blocks))
243 -        self.logger.info('Blocking reduction : %s'
244 -                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
245 -
246 -
247 -###############################################################################
248 -### PIPELINE ALIGNER OBJECT ##################################################
249 -###############################################################################
250 -class PipelineAligner(object):
251 -    """ This pipeline will perform iterative alignments, removing each time
252 -    the aligned results from the previous aligner.
253 -    """
254 -
255 -    def __init__(self, aligners):
256 -        self.aligners = aligners
257 -        self.pairs = {}
258 -        self.nb_comparisons = 0
259 -        self.nb_blocks = 0
260 -        self.alignments_done = 0
261 -        self.pairs_found = 0
262 -        self.refset_size = None
263 -        self.targetset_size = None
264 -        self.time = None
265 -        self.logger = logging.getLogger('nazca.aligner')
266 -
267 -    def get_aligned_pairs(self, refset, targetset, unique=True):
268 -        """ Get the pairs of aligned elements
269 -        """
270 -        start_time = time.time()
271 -        ref_index = range(len(refset))
272 -        target_index = range(len(targetset))
273 -        self.refset_size = len(refset)
274 -        self.targetset_size = len(targetset)
275 -        global_matched = {}
276 -        global_mat = lil_matrix((len(refset), len(targetset)))
277 -        seen_refset = set()
278 -        # Iteration over aligners
279 -        for ind_aligner, aligner in enumerate(self.aligners):
280 -            # Perform alignment
281 -            _refset = [refset[i] for i in ref_index]
282 -            _targetset = [targetset[i] for i in target_index]
283 -            for pair in aligner.get_aligned_pairs(_refset, _targetset, unique):
284 -                self.pairs_found += 1
285 -                pair = ((pair[0][0], ref_index[pair[0][1]]),
286 -                        (pair[1][0], target_index[pair[1][1]]))
287 -                yield pair
288 -                seen_refset.add(pair[0][1])
289 -            # Store stats
290 -            self.nb_blocks += aligner.nb_blocks
291 -            self.nb_comparisons += aligner.nb_comparisons
292 -            # Update indexes if necessary
293 -            # For now, we remove all the reference records that are already matched
294 -            if ind_aligner < len(self.aligners) - 1:
295 -                # There are other aligners after this one
296 -                ref_index = [i for i in ref_index if i not in seen_refset]
297 -        self.time = time.time() - start_time
298 -        self.log_infos()
299 -
300 -    def log_infos(self):
301 -        """ Display some info on the aligner process
302 -        """
303 -        self.logger.info('Computation time : %s' % self.time)
304 -        self.logger.info('Size reference set : %s' % self.refset_size)
305 -        self.logger.info('Size target set : %s' % self.targetset_size)
306 -        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
307 -        self.logger.info('Alignments done : %s' % self.alignments_done)
308 -        self.logger.info('Pairs found : %s' % self.pairs_found)
309 -        self.logger.info('Ratio reference set/alignments done : %s'
310 -                         % (self.alignments_done/float(self.refset_size)))
311 -        self.logger.info('Ratio target set/alignments done : %s'
312 -                         % (self.alignments_done/float(self.targetset_size)))
313 -        self.logger.info('Ratio reference set/pairs found : %s'
314 -                         % (self.pairs_found/float(self.refset_size)))
315 -        self.logger.info('Ratio target set/pairs found : %s'
316 -                         % (self.pairs_found/float(self.targetset_size)))
317 -        self.logger.info('Maximum comparisons : %s'
318 -                         % (self.refset_size * self.targetset_size))
319 -        self.logger.info('Number of blocks : %s' % self.nb_blocks)
320 -        if self.nb_blocks:
321 -            self.logger.info('Ratio comparisons/block : %s'
322 -                             % (float(self.nb_comparisons)/self.nb_blocks))
323 -        self.logger.info('Blocking reduction : %s'
324 -                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
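For context, here is a minimal sketch of how the BaseAligner above is driven (not part of this changeset). The `LengthProcessing` class and the datasets are illustrative only, and the import path assumes the new record_linkage/ layout introduced by this changeset; a real setup would rather use a processing object from nazca.distances:

import numpy as np

from nazca.record_linkage.aligner import BaseAligner  # assumed post-move path

class LengthProcessing(object):
    """ Hypothetical processing: distance = difference of attribute lengths """
    def __init__(self, ref_attr_index, target_attr_index):
        self.ref_attr_index = ref_attr_index
        self.target_attr_index = target_attr_index

    def cdist(self, refset, targetset, ref_indexes, target_indexes):
        # compute_distance_matrix() expects a matrix of shape
        # (len(ref_indexes), len(target_indexes))
        mat = np.zeros((len(ref_indexes), len(target_indexes)), dtype='float32')
        for i, ri in enumerate(ref_indexes):
            for j, ti in enumerate(target_indexes):
                mat[i, j] = abs(len(refset[ri][self.ref_attr_index])
                                - len(targetset[ti][self.target_attr_index]))
        return mat

refset = [('r1', 'Paris'), ('r2', 'Toulouse')]  # r[0] is the record identifier
targetset = [('t1', 'Paris'), ('t2', 'Lyon')]
aligner = BaseAligner(threshold=1, processings=(LengthProcessing(1, 1),))
for (ref_id, _), (target_id, _), dist in aligner.get_aligned_pairs(refset, targetset):
    print ref_id, target_id, dist  # -> r1 t1 0.0 (best match only, since unique=True)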
diff --git a/blocking.py b/blocking.py
@@ -1,666 +0,0 @@
325 -# -*- coding:utf-8 -*-
326 -# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
327 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
328 -#
329 -# This program is free software: you can redistribute it and/or modify it under
330 -# the terms of the GNU Lesser General Public License as published by the Free
331 -# Software Foundation, either version 2.1 of the License, or (at your option)
332 -# any later version.
333 -#
334 -# This program is distributed in the hope that it will be useful, but WITHOUT
335 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
336 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
337 -# details.
338 -#
339 -# You should have received a copy of the GNU Lesser General Public License along
340 -# with this program. If not, see <http://www.gnu.org/licenses/>.
341 -
342 -
343 -""" Blocking techniques.
344 -
345 -This module implements a set of blocking techniques used to split
346 -datasets into smaller subsets that will be aligned in more detail.
347 -
348 -Additional information:
349 -
350 -   P. Christen, Data Matching, Data-Centric Systems and Applications,
351 -   Springer, 2012.
352 -
353 -"""
354 -from functools import partial
355 -import warnings
356 -
357 -from scipy.spatial import KDTree
358 -
359 -from nazca.minhashing import Minlsh
360 -from nazca.distances import soundexcode
361 -
362 -
363 -###############################################################################
364 -### GENERAL BLOCKING ##########################################################
365 -###############################################################################
366 -class BaseBlocking(object):
367 -    """ An abstract general blocking object that exposes
368 -    the API that should be common to all blockings object
369 -    """
370 -    def __init__(self, ref_attr_index, target_attr_index):
371 -        """ Build the blocking object
372 -
373 -        Parameters
374 -        ----------
375 -
376 -        ref_attr_index: index of the attribute of interest in a record
377 -                        for the reference dataset
378 -                        (i.e. attribute to be used for key computation)
379 -
380 -        target_attr_index: index of the attribute of interest in a record
381 -                           for the target dataset
382 -                           (i.e. attribute to be used for key computation)
383 -        """
384 -        self.ref_attr_index = ref_attr_index
385 -        self.target_attr_index = target_attr_index
386 -        self.refids = None
387 -        self.targetids = None
388 -        self.is_fitted = False
389 -
390 -    def _fit(self, refset, targetset):
391 -        raise NotImplementedError
392 -
393 -    def _iter_blocks(self):
394 -        """ Internal iteration function over blocks
395 -        """
396 -        raise NotImplementedError
397 -
398 -    def _cleanup(self):
399 -        """ Internal cleanup blocking for further use (e.g. in pipeline)
400 -        """
401 -        raise NotImplementedError
402 -
403 -    def fit(self, refset, targetset):
404 -        """ Fit the blocking technique on the reference and target datasets
405 -
406 -        Parameters
407 -        ----------
408 -        refset: a dataset (list of records)
409 -
410 -        targetset: a dataset (list of records)
411 -        """
412 -        self._fit(refset, targetset)
413 -        # Keep ids for blocks building
414 -        self.refids = [(i, r[0]) for i, r in enumerate(refset)]
415 -        self.targetids = [(i, r[0]) for i, r in enumerate(targetset)]
416 -        self.is_fitted = True
417 -
418 -    def iter_blocks(self):
419 -        """ Iterator over the different possible blocks.
420 -
421 -        Returns
422 -        -------
423 -
424 -        (block1, block2): The blocks are always (reference_block, target_block)
425 -                          and contain the (index, id) pairs of the records in the
426 -                          corresponding dataset.
427 -        """
428 -        assert self.is_fitted
429 -        return self._iter_blocks()
430 -
431 -    def iter_indice_blocks(self):
432 -        """ Iterator over the different possible blocks.
433 -
434 -        Returns
435 -        -------
436 -
437 -        (block1, block2): The blocks are always (reference_block, target_block)
438 -                          and contain the indexes of the records in the
439 -                          corresponding dataset.
440 -        """
441 -        assert self.is_fitted
442 -        for block1, block2 in self._iter_blocks():
443 -            yield [r[0] for r in block1], [r[0] for r in block2]
444 -
445 -    def iter_id_blocks(self):
446 -        """ Iterator over the different possible blocks.
447 -
448 -        Returns
449 -        -------
450 -
451 -        (block1, block2): The blocks are always (reference_block, target_block)
452 -                          and contain the ids of the records in the
453 -                          corresponding dataset.
454 -        """
455 -        assert self.is_fitted
456 -        for block1, block2 in self._iter_blocks():
457 -            yield [r[1] for r in block1], [r[1] for r in block2]
458 -
459 -    def iter_pairs(self):
460 -        """ Iterator over the different possible pairs.
461 -
462 -        Returns
463 -        -------
464 -
465 -        (pair1, pair2): The pairs are always ((ind_reference, id_reference),
466 -                                              (ind_target, id_target)),
467 -                        taken from the corresponding datasets.
468 -        """
469 -        assert self.is_fitted
470 -        for block1, block2 in self.iter_blocks():
471 -            for val1 in block1:
472 -                for val2 in block2:
473 -                    yield val1, val2
474 -
475 -    def iter_indice_pairs(self):
476 -        """ Iterator over the different possible pairs.
477 -
478 -        Returns
479 -        -------
480 -
481 -        (pair1, pair2): The pairs are always (ind_reference, ind_target),
482 -                        i.e. the indexes of the records in the corresponding datasets.
483 -        """
484 -        assert self.is_fitted
485 -        for block1, block2 in self.iter_indice_blocks():
486 -            for val1 in block1:
487 -                for val2 in block2:
488 -                    yield val1, val2
489 -
490 -    def iter_id_pairs(self):
491 -        """ Iterator over the different possible pairs.
492 -
493 -        Returns
494 -        -------
495 -
496 -        (pair1, pair2): The pairs are always (id_reference, id_target),
497 -                        i.e. the ids of the records in the corresponding datasets.
498 -        """
499 -        assert self.is_fitted
500 -        for block1, block2 in self.iter_id_blocks():
501 -            for val1 in block1:
502 -                for val2 in block2:
503 -                    yield val1, val2
504 -
505 -    def cleanup(self):
506 -        """ Cleanup blocking for further use (e.g. in pipeline)
507 -        """
508 -        self.is_fitted = False
509 -        self._cleanup()
510 -
511 -
512 -###############################################################################
513 -### KEY BLOCKING ##############################################################
514 -###############################################################################
515 -class KeyBlocking(BaseBlocking):
516 -    """ This blocking technique is based on a a blocking criteria
517 -    (or blocking key), that will be used to divide the datasets.
518 -
519 -    The main idea here is:
520 -
521 -    1 - to create an index of f(x) for each x in the reference set.
522 -
523 -    2 - to create an index of f(y) for each y in the target set.
524 -
525 -    3 - to iterate on each distinct value of f(x) and to return
526 -        the identifiers of the records of both sets for this value.
527 -    """
528 -
529 -    def __init__(self, ref_attr_index, target_attr_index, callback, ignore_none=False):
530 -        super(KeyBlocking, self).__init__(ref_attr_index, target_attr_index)
531 -        self.callback = callback
532 -        self.ignore_none = ignore_none
533 -        self.reference_index = {}
534 -        self.target_index = {}
535 -
536 -    def _fit(self, refset, targetset):
537 -        """ Fit a dataset in an index using the callback
538 -        """
539 -        for ind, rec in enumerate(refset):
540 -            key = self.callback(rec[self.ref_attr_index])
541 -            if not key and self.ignore_none:
542 -                continue
543 -            self.reference_index.setdefault(key, []).append((ind, rec[0]))
544 -        for ind, rec in enumerate(targetset):
545 -            key = self.callback(rec[self.target_attr_index])
546 -            if not key and self.ignore_none:
547 -                continue
548 -            self.target_index.setdefault(key, []).append((ind, rec[0]))
549 -
550 -    def _iter_blocks(self):
551 -        """ Iterator over the different possible blocks.
552 -
553 -        Returns
554 -        -------
555 -
556 -        (block1, block2): The blocks are always (reference_block, target_block)
557 -                          and contain the (index, id) pairs of the records in the
558 -                          corresponding dataset.
559 -        """
560 -        for key, block1 in self.reference_index.iteritems():
561 -            block2 = self.target_index.get(key)
562 -            if block1 and block2:
563 -                yield (block1, block2)
564 -
565 -    def _cleanup(self):
566 -        """ Cleanup blocking for further use (e.g. in pipeline)
567 -        """
568 -        self.reference_index = {}
569 -        self.target_index = {}
570 -
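To make the three-step scheme described in the KeyBlocking docstring concrete, here is a minimal sketch (not part of this changeset); the first-letter callback and the toy records are illustrative only, and the import path assumes the new record_linkage/ layout:

from nazca.record_linkage.blocking import KeyBlocking  # assumed post-move path

# Block on the lowercased first letter of the name attribute (column 1)
blocking = KeyBlocking(ref_attr_index=1, target_attr_index=1,
                       callback=lambda attr: attr[0].lower())
blocking.fit([('r1', 'Paris'), ('r2', 'Lyon')],
             [('t1', 'paris'), ('t2', 'Lille')])
for refblock, targetblock in blocking.iter_blocks():
    print refblock, targetblock
# Two blocks (iteration order may vary):
#   [(0, 'r1')] [(0, 't1')]   key 'p'
#   [(1, 'r2')] [(1, 't2')]   key 'l'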
571 -
572 -class SoundexBlocking(KeyBlocking):
573 -
574 -    def __init__(self, ref_attr_index, target_attr_index, language='french',):
575 -        super(SoundexBlocking, self).__init__(ref_attr_index, target_attr_index,
576 -                                              partial(soundexcode, language=language))
577 -
578 -
579 -###############################################################################
580 -### BIGRAM BLOCKING ###########################################################
581 -###############################################################################
582 -class NGramBlocking(BaseBlocking):
583 -    """ This blocking technique is based on a a n-gram key.
584 -    """
585 -
586 -    def __init__(self, ref_attr_index, target_attr_index, ngram_size=2, depth=2):
587 -        super(NGramBlocking, self).__init__(ref_attr_index, target_attr_index)
588 -        self.ngram_size = ngram_size
589 -        self.depth = depth
590 -        self.reference_index = {}
591 -        self.target_index = {}
592 -
593 -    def _fit_dataset(self, dataset, cur_index, attr_index):
594 -        """ Fit a dataset
595 -        """
596 -        for ind, r in enumerate(dataset):
597 -            cur_dict = cur_index
598 -            text = r[attr_index]
599 -            for i in range(self.depth):
600 -                ngram = text[i*self.ngram_size:(i+1)*self.ngram_size]
601 -                if i < self.depth - 1:
602 -                    cur_dict = cur_dict.setdefault(ngram, {})
603 -            cur_dict.setdefault(ngram, []).append((ind, r[0]))
604 -
605 -    def _fit(self, refset, targetset):
606 -        """ Fit the two sets (reference set and target set)
607 -        """
608 -        self._fit_dataset(refset, self.reference_index, self.ref_attr_index)
609 -        self._fit_dataset(targetset, self.target_index, self.target_attr_index)
610 -
611 -    def _iter_dict(self, ref_cur_dict, target_cur_dict):
612 -        """ Iterative function used to create blocks from dicts
613 -        """
614 -        for key, sub_dict in ref_cur_dict.iteritems():
615 -            if key in target_cur_dict:
616 -                if isinstance(sub_dict, dict):
617 -                    # There is another dict layer
618 -                    for block1, block2 in self._iter_dict(sub_dict, target_cur_dict[key]):
619 -                        yield block1, block2
620 -                else:
621 -                    # This is a list
622 -                    yield sub_dict, target_cur_dict[key]
623 -
624 -    def _iter_blocks(self):
625 -        """ Iterator over the different possible blocks.
626 -
627 -        Returns
628 -        -------
629 -
630 -        (block1, block2): The blocks are always (reference_block, target_block)
631 -                          and contain the (index, id) pairs of the records in the
632 -                          corresponding dataset.
633 -        """
634 -        for block1, block2 in self._iter_dict(self.reference_index, self.target_index):
635 -            if block1 and block2:
636 -                yield block1, block2
637 -
638 -    def _cleanup(self):
639 -        """ Cleanup blocking for further use (e.g. in pipeline)
640 -        """
641 -        self.reference_index = {}
642 -        self.target_index = {}
643 -
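A minimal sketch of NGramBlocking (not part of this changeset): with ngram_size=2 and depth=2, 'paris' is indexed under the nested keys ('pa', 'ri'), so 'parisot' lands in the same block. Data and import path are illustrative only:

from nazca.record_linkage.blocking import NGramBlocking  # assumed post-move path

blocking = NGramBlocking(1, 1, ngram_size=2, depth=2)
blocking.fit([('r1', 'paris')], [('t1', 'parisot')])
for refblock, targetblock in blocking.iter_blocks():
    print refblock, targetblock  # -> [(0, 'r1')] [(0, 't1')]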
644 -
645 -###############################################################################
646 -### SORTKEY BLOCKING ##########################################################
647 -###############################################################################
648 -class SortedNeighborhoodBlocking(BaseBlocking):
649 -    """ This blocking technique is based on a a sorting blocking criteria
650 -    (or blocking key), that will be used to divide the datasets.
651 -    """
652 -
653 -    def __init__(self, ref_attr_index, target_attr_index, key_func=lambda x: x, window_width=20):
654 -        super(SortedNeighborhoodBlocking, self).__init__(ref_attr_index, target_attr_index)
655 -        self.key_func = key_func
656 -        self.window_width = window_width
657 -        self.sorted_dataset = None
658 -
659 -    def _fit(self, refset, targetset):
660 -        """ Fit a dataset in an index using the callback
661 -        """
662 -        self.sorted_dataset = [((ind, r[0]), r[self.ref_attr_index], 0)
663 -                               for ind, r in enumerate(refset)]
664 -        self.sorted_dataset.extend([((ind, r[0]), r[self.target_attr_index], 1)
665 -                                    for ind, r in enumerate(targetset)])
666 -        self.sorted_dataset.sort(key=lambda x: self.key_func(x[1]))
667 -
668 -    def _iter_blocks(self):
669 -        """ Iterator over the different possible blocks.
670 -        """
671 -        for ind, (rid, record, dset) in enumerate(self.sorted_dataset):
672 -            # Only keep reference set records
673 -            if dset == 1:
674 -                continue
675 -            block1 = [rid,]
676 -            minind = (ind - self.window_width)
677 -            minind = minind if minind >=0 else 0
678 -            maxind = (ind + self.window_width + 1)
679 -            block2 = [ri for ri, re, d in self.sorted_dataset[minind:maxind]
680 -                      if d == 1]
681 -            if block1 and block2:
682 -                yield (block1, block2)
683 -
684 -    def _cleanup(self):
685 -        """ Cleanup blocking for further use (e.g. in pipeline)
686 -        """
687 -        self.sorted_dataset = None
688 -
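A minimal sketch of the sorted-neighborhood technique (not part of this changeset): both datasets are concatenated, sorted on the key, and each reference record is paired with the target records falling inside a sliding window. Data and import path are illustrative only:

from nazca.record_linkage.blocking import SortedNeighborhoodBlocking  # assumed post-move path

blocking = SortedNeighborhoodBlocking(ref_attr_index=1, target_attr_index=1,
                                      window_width=2)
blocking.fit([('r1', 'alice'), ('r2', 'bob')],
             [('t1', 'alicia'), ('t2', 'bobby')])
for refblock, targetblock in blocking.iter_blocks():
    print refblock, targetblock
# -> [(0, 'r1')] [(0, 't1')]             'alice' only sees 'alicia' in its window
#    [(1, 'r2')] [(0, 't1'), (1, 't2')]  'bob' sees both neighbours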
689 -
690 -###############################################################################
691 -### MERGE BLOCKING ############################################################
692 -###############################################################################
693 -class MergeBlocking(BaseBlocking):
694 -    """ This blocking technique keep only one appearance of one given values,
695 -    and removes all the other records having this value.
696 -    The merge is based on a score function
697 -
698 -    E.g.
699 -      ('http://fr.wikipedia.org/wiki/Paris_%28Texas%29', 'Paris', 25898)
700 -      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)
701 -
702 -    could be (with a score function based on the population (third value):
703 -
704 -      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)
705 -
706 -    !!! WARNING !!! This is only done on ONE set (the one with a non null attr index)
707 -    """
708 -
709 -    def __init__(self, ref_attr_index, target_attr_index, score_func):
710 -        super(MergeBlocking, self).__init__(ref_attr_index, target_attr_index)
711 -        self.score_func = score_func
712 -        self.merged_dataset = None
713 -        self.other_dataset = None
714 -        if ref_attr_index is None and target_attr_index is None:
715 -            raise ValueError('At least one of ref_attr_index or target_attr_index '
716 -                             'should not be None')
717 -
718 -    def _fit(self, refset, targetset):
719 -        """ Fit a dataset in an index using the callback
720 -        """
721 -        if self.ref_attr_index is not None:
722 -            # Merge refset
723 -            self.merged_dataset = self._merge_dataset(refset, self.ref_attr_index)
724 -            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(targetset)]
725 -        else:
726 -            # Merge targetset
727 -            self.merged_dataset = self._merge_dataset(targetset, self.target_attr_index)
728 -            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(refset)]
729 -
730 -    def _merge_dataset(self, dataset, attr_index):
731 -        """ Merge a dataset
732 -        """
733 -        merged_dataset_dict = {}
734 -        for ind, record in enumerate(dataset):
735 -            score = self.score_func(record)
736 -            if record[attr_index] not in merged_dataset_dict:
737 -                # Create new entry
738 -                merged_dataset_dict[record[attr_index]] = (ind, record, score)
739 -            elif (record[attr_index] in merged_dataset_dict
740 -                  and merged_dataset_dict[record[attr_index]][2] < score):
741 -                # Change current score
742 -                merged_dataset_dict[record[attr_index]] = (ind, record, score)
743 -        return [(ind, r[0]) for ind, r, score in merged_dataset_dict.itervalues()]
744 -
745 -    def _iter_blocks(self):
746 -        """ Iterator over the different possible blocks.
747 -        """
748 -        if self.ref_attr_index is not None:
749 -            yield self.merged_dataset, self.other_dataset
750 -        else:
751 -            # self.target_attr_index is not None
752 -            yield self.other_dataset, self.merged_dataset
753 -
754 -    def _cleanup(self):
755 -        """ Cleanup blocking for further use (e.g. in pipeline)
756 -        """
757 -        self.merged_dataset = None
758 -        self.other_dataset = None
759 -
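A minimal sketch mirroring the Paris example in the docstring above (not part of this changeset): the score function keeps, for each name, the record with the largest population (third column). The import path is an assumption:

from nazca.record_linkage.blocking import MergeBlocking  # assumed post-move path

refset = [('http://fr.wikipedia.org/wiki/Paris_%28Texas%29', 'Paris', 25898),
          ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)]
targetset = [('t1', 'Paris', 0)]
blocking = MergeBlocking(ref_attr_index=1, target_attr_index=None,
                         score_func=lambda r: r[2])
blocking.fit(refset, targetset)
for refblock, targetblock in blocking.iter_blocks():
    print refblock  # -> [(1, 'http://fr.wikipedia.org/wiki/Paris')]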
760 -
761 -###############################################################################
762 -### CLUSTERING-BASED BLOCKINGS ################################################
763 -###############################################################################
764 -class KmeansBlocking(BaseBlocking):
765 -    """ A blocking technique based on Kmeans
766 -    """
767 -
768 -    def __init__(self, ref_attr_index, target_attr_index, n_clusters=None):
769 -        super(KmeansBlocking, self).__init__(ref_attr_index, target_attr_index)
770 -        self.n_clusters = n_clusters
771 -        self.kmeans = None
772 -        self.predicted = None
773 -        from sklearn import cluster
774 -        self.cluster_class = cluster.KMeans
775 -
776 -    def _fit(self, refset, targetset):
777 -        """ Fit the reference dataset.
778 -        """
779 -        # If an element is None (missing), use instead the identity element.
780 -        # The identity element is defined as the 0-vector
781 -        idelement = tuple([0 for _ in xrange(len(refset[0][self.ref_attr_index]))])
782 -        # We assume here that there are at least 2 elements in the refset
783 -        n_clusters = self.n_clusters or (len(refset)/10 or len(refset)/2)
784 -        kmeans = self.cluster_class(n_clusters=n_clusters)
785 -        kmeans.fit([elt[self.ref_attr_index] or idelement for elt in refset])
786 -        self.kmeans = kmeans
787 -        # Predict on targetset
788 -        self.predicted = self.kmeans.predict([elt[self.target_attr_index]
789 -                                              or idelement for elt in targetset])
790 -
791 -    def _iter_blocks(self):
792 -        """ Iterator over the different possible blocks.
793 -
794 -        Returns
795 -        -------
796 -
797 -        (block1, block2): The blocks are always (reference_block, target_block)
798 -                          and contain the (index, id) pairs of the records in the
799 -                          corresponding dataset.
800 -        """
801 -        neighbours = [[[], []] for _ in xrange(self.kmeans.n_clusters)]
802 -        for ind, li in enumerate(self.predicted):
803 -            neighbours[li][1].append(self.targetids[ind])
804 -        for ind, li in enumerate(self.kmeans.labels_):
805 -            neighbours[li][0].append(self.refids[ind])
806 -        for block1, block2 in neighbours:
807 -            if len(block1) and len(block2):
808 -                yield block1, block2
809 -
810 -    def _cleanup(self):
811 -        """ Cleanup blocking for further use (e.g. in pipeline)
812 -        """
813 -        self.kmeans = None
814 -        self.predicted = None
815 -
816 -
817 -###############################################################################
818 -### KDTREE BLOCKINGS ##########################################################
819 -###############################################################################
820 -class KdTreeBlocking(BaseBlocking):
821 -    """ A blocking technique based on KdTree
822 -    """
823 -    def __init__(self, ref_attr_index, target_attr_index, threshold=0.1):
824 -        super(KdTreeBlocking, self).__init__(ref_attr_index, target_attr_index)
825 -        self.threshold = threshold
826 -        self.reftree = None
827 -        self.targettree = None
828 -        self.nb_elements = None
829 -
830 -    def _fit(self, refset, targetset):
831 -        """ Fit the blocking
832 -        """
833 -        firstelement = refset[0][self.ref_attr_index]
834 -        self.nb_elements = len(refset)
835 -        idsize = len(firstelement) if isinstance(firstelement, (tuple, list)) else 1
836 -        idelement = (0,) * idsize
837 -        # KDTree is expecting a two-dimensional array
838 -        if idsize == 1:
839 -            self.reftree = KDTree([(elt[self.ref_attr_index] or 0,) for elt in refset])
840 -            self.targettree = KDTree([(elt[self.target_attr_index] or 0,) for elt in targetset])
841 -        else:
842 -            self.reftree = KDTree([elt[self.ref_attr_index] or idelement for elt in refset])
843 -            self.targettree = KDTree([elt[self.target_attr_index] or idelement for elt in targetset])
844 -
845 -    def _iter_blocks(self):
846 -        """ Iterator over the different possible blocks.
847 -
848 -        Returns
849 -        -------
850 -
851 -        (block1, block2): The blocks are always (reference_block, target_block)
852 -                          and contain the (index, id) pairs of the records in the
853 -                          corresponding dataset.
854 -        """
855 -        extraneighbours = self.reftree.query_ball_tree(self.targettree, self.threshold)
856 -        neighbours = []
857 -        for ind in xrange(self.nb_elements):
858 -            if not extraneighbours[ind]:
859 -                continue
860 -            _ref = [self.refids[ind],]
861 -            _target = [self.targetids[v] for v in extraneighbours[ind]]
862 -            neighbours.append((_ref, _target))
863 -        for block1, block2 in neighbours:
864 -            if len(block1) and len(block2):
865 -                yield block1, block2
866 -
867 -    def _cleanup(self):
868 -        """ Cleanup blocking for further use (e.g. in pipeline)
869 -        """
870 -        self.reftree = None
871 -        self.targettree = None
872 -        self.nb_elements = None
873 -
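A minimal sketch of KdTreeBlocking (not part of this changeset): with scalar attributes the values are wrapped in 1-tuples, and query_ball_tree() pairs each reference record with the target records within `threshold`. Data and import path are illustrative only:

from nazca.record_linkage.blocking import KdTreeBlocking  # assumed post-move path

blocking = KdTreeBlocking(1, 1, threshold=0.5)
blocking.fit([('r1', 1.0), ('r2', 5.0)],
             [('t1', 1.2), ('t2', 5.1)])
for refblock, targetblock in blocking.iter_blocks():
    print refblock, targetblock
# -> [(0, 'r1')] [(0, 't1')]
#    [(1, 'r2')] [(1, 't2')]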
874 -
875 -###############################################################################
876 -### MINHASHING BLOCKINGS ######################################################
877 -###############################################################################
878 -class MinHashingBlocking(BaseBlocking):
879 -    """ A blocking technique based on MinHashing
880 -    """
881 -    def __init__(self, ref_attr_index, target_attr_index,
882 -                 threshold=0.1, kwordsgram=1, siglen=200):
883 -        super(MinHashingBlocking, self).__init__(ref_attr_index, target_attr_index)
884 -        self.threshold = threshold
885 -        self.kwordsgram = kwordsgram
886 -        self.siglen = siglen
887 -        self.minhasher = Minlsh()
888 -        self.nb_elements = None
889 -
890 -    def _fit(self, refset, targetset):
891 -        """ Find the blocking using minhashing
892 -        """
893 -        # If an element is None (missing), use instead the identity element.
894 -        idelement = ''
895 -        self.minhasher.train([elt[self.ref_attr_index] or idelement for elt in refset] +
896 -                        [elt[self.target_attr_index] or idelement for elt in targetset],
897 -                        self.kwordsgram, self.siglen)
898 -        self.nb_elements = len(refset)
899 -
900 -    def _iter_blocks(self):
901 -        """ Iterator over the different possible blocks.
902 -
903 -        Returns
904 -        -------
905 -
906 -        (block1, block2): The blocks are always (reference_block, target_block)
907 -                          and contain the (index, id) pairs of the records in the
908 -                          corresponding dataset.
909 -        """
910 -        rawneighbours = self.minhasher.predict(self.threshold)
911 -        neighbours = []
912 -        for data in rawneighbours:
913 -            neighbours.append([[], []])
914 -            for i in data:
915 -                if i >= self.nb_elements:
916 -                    neighbours[-1][1].append(self.targetids[i - self.nb_elements])
917 -                else:
918 -                    neighbours[-1][0].append(self.refids[i])
919 -            if len(neighbours[-1][0]) == 0 or len(neighbours[-1][1]) == 0:
920 -                neighbours.pop()
921 -        for block1, block2 in neighbours:
922 -            if len(block1) and len(block2):
923 -                yield block1, block2
924 -
925 -    def _cleanup(self):
926 -        """ Cleanup blocking for further use (e.g. in pipeline)
927 -        """
928 -        self.minhasher = Minlsh()
929 -        self.nb_elements = None
930 -
931 -
932 -###############################################################################
933 -### BLOCKING PIPELINE #########################################################
934 -###############################################################################
935 -class PipelineBlocking(BaseBlocking):
936 -    """ Pipeline multiple blocking techniques
937 -    """
938 -
939 -    def __init__(self, blockings, collect_stats=False):
940 -        """ Build the blocking object
941 -
942 -        Parameters
943 -        ----------
944 -
945 -        blockings: ordered list of blocking objects
946 -        """
947 -        self.blockings = blockings
948 -        self.stored_blocks = []
949 -        self.collect_stats = collect_stats
950 -        self.stats = {}
951 -
952 -    def _fit(self, refset, targetset):
953 -        """ Internal fit of the pipeline """
954 -        self._recursive_fit(refset, targetset, range(len(refset)), range(len(targetset)), 0)
955 -
956 -    def _recursive_fit(self, refset, targetset, ref_index, target_index, ind):
957 -        """ Recursive fit of the blockings.
958 -        Blocks are stored in the stored_blocks attribute.
959 -        """
960 -        if ind < len(self.blockings) - 1:
961 -            # There are other blockings after this one
962 -            blocking = self.blockings[ind]
963 -            blocking.cleanup()
964 -            blocking.fit([refset[i] for i in ref_index],
965 -                         [targetset[i] for i in target_index])
966 -            for block1, block2 in blocking.iter_indice_blocks():
967 -                ind_block1 = [ref_index[i] for i in block1]
968 -                ind_block2 = [target_index[i] for i in block2]
969 -                if self.collect_stats:
970 -                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
971 -                self._recursive_fit(refset, targetset, ind_block1, ind_block2, ind+1)
972 -        else:
973 -            # This is the final blocking
974 -            blocking = self.blockings[ind]
975 -            blocking.cleanup()
976 -            blocking.fit([refset[i] for i in ref_index],
977 -                         [targetset[i] for i in target_index])
978 -            for block1, block2 in blocking.iter_blocks():
979 -                ind_block1 = [(ref_index[i], _id) for i, _id in block1]
980 -                ind_block2 = [(target_index[i], _id) for i, _id in block2]
981 -                if self.collect_stats:
982 -                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
983 -                self.stored_blocks.append((ind_block1, ind_block2))
984 -
985 -    def _iter_blocks(self):
986 -        """ Internal iteration function over blocks
987 -        """
988 -        for block1, block2 in self.stored_blocks:
989 -            if block1 and block2:
990 -                yield block1, block2
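A minimal sketch of PipelineBlocking (not part of this changeset): a coarse bigram pass whose blocks are re-blocked by a sorted-neighborhood pass; collect_stats records the block sizes produced at each stage. Data and import path are illustrative only:

from nazca.record_linkage.blocking import (PipelineBlocking, NGramBlocking,
                                           SortedNeighborhoodBlocking)  # assumed post-move path

refset = [('r1', 'paris'), ('r2', 'paray'), ('r3', 'lyon')]
targetset = [('t1', 'paris'), ('t2', 'parisot'), ('t3', 'lyons')]
blocking = PipelineBlocking([NGramBlocking(1, 1, ngram_size=2, depth=1),
                             SortedNeighborhoodBlocking(1, 1, window_width=2)],
                            collect_stats=True)
blocking.fit(refset, targetset)
for refblock, targetblock in blocking.iter_blocks():
    print refblock, targetblock
print blocking.stats  # {stage index: [(block1 size, block2 size), ...]}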
diff --git a/old_api.py b/old_api.py
@@ -1,432 +0,0 @@
991 -# -*- coding:utf-8 -*-
992 -#
993 -# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
994 -# contact http://www.logilab.fr -- mailto:contact@logilab.fr
995 -#
996 -# This program is free software: you can redistribute it and/or modify it under
997 -# the terms of the GNU Lesser General Public License as published by the Free
998 -# Software Foundation, either version 2.1 of the License, or (at your option)
999 -# any later version.
1000 -#
1001 -# This program is distributed in the hope that it will be useful, but WITHOUT
1002 -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1003 -# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1004 -# details.
1005 -#
1006 -# You should have received a copy of the GNU Lesser General Public License along
1007 -# with this program. If not, see <http://www.gnu.org/licenses/>.
1008 -
1009 -from os import listdir
1010 -import os.path as osp
1011 -from shutil import rmtree
1012 -from tempfile import mkdtemp
1013 -import sys
1014 -import warnings
1015 -from functools import partial
1016 -
1017 -from scipy.sparse import lil_matrix
1018 -
1019 -from nazca.dataio import write_results, split_file, parsefile
1020 -from nazca.normalize import BaseNormalizer, NormalizerPipeline
1021 -from nazca.blocking import KmeansBlocking, KdTreeBlocking, MinHashingBlocking
1022 -from nazca.distances import GeographicalProcessing
1023 -from nazca.aligner import BaseAligner
1024 -
1025 -
1026 -# Backward compatibility: these functions now use the BaseAligner internally.
1027 -# They may be removed in a later release...
1028 -
1029 -
1030 -###############################################################################
1031 -### NORMALIZE FUNCTIONS #######################################################
1032 -###############################################################################
1033 -# Backward compatibility: these functions now use the NormalizerPipeline internally.
1034 -# They may be removed in a later release...
1035 -
1036 -def normalize_set(rset, processings):
1037 -    """ Apply all the normalization functions to the given rset """
1038 -    warnings.warn(DeprecationWarning('This function will be removed '
1039 -                                     'in the next release. '
1040 -                                     'You should rather use the BaseNormalizer '
1041 -                                     'object of the normalize module'))
1042 -    normalizers = []
1043 -    for ind, processing in processings.iteritems():
1044 -        for normalizer in extract_normalization_from_treatment(processing, ind):
1045 -            normalizers.append(normalizer)
1046 -    # Create pipeline
1047 -    pipeline = NormalizerPipeline(normalizers)
1048 -    return pipeline.normalize_dataset(rset)
1049 -
1050 -def extract_normalization_from_treatment(processing, ind):
1051 -    """ Extract normalization from processing.
1052 -    This function is used for backward compatibility with
1053 -    the old function-based API """
1054 -    warnings.warn(DeprecationWarning('This function will be removed '
1055 -                                     'in the next release. '
1056 -                                     'You should rather use the BaseNormalizer '
1057 -                                     'object of the normalize module'))
1058 -    for f in processing.get('normalization', []):
1059 -        farg = f.func_code.co_varnames  # names of the arguments of f
1060 -        # Intersection between the arguments needed by f and the
1061 -        # provided ones
1062 -        givenargs = dict((arg, processing['norm_params'][arg])
1063 -                         for arg in farg if arg in processing.get('norm_params', []))
1064 -        callback = f
1065 -        if givenargs:
1066 -            callback = partial(callback, **givenargs)
1067 -        yield BaseNormalizer(callback=callback, attr_index=ind)
1068 -
1069 -def extract_treatment_from_treatment(processing, ind):
1070 -    """ Extract Treatment object from processing dict.
1071 -    This is only for backward compatibility with the old API.
1072 -    """
1073 -    if processing['metric'] == 'geographical':
1074 -        return GeographicalProcessing(ind, ind,
1075 -                                     matrix_normalized=processing.get('matrix_normalized', False),
1076 -                                     **processing.get('metric_params', {}))
1077 -
1078 -
1079 -###############################################################################
1080 -### ALIGNER ###################################################################
1081 -###############################################################################
1082 -def align(alignset, targetset, threshold, processings=None, resultfile=None,
1083 -          _applyNormalization=True):
1084 -    """ Try to align the items of alignset onto targetset's ones
1085 -
1086 -        `alignset` and `targetset` are the sets to align. Each set contains
1087 -        lists whose first column is the identifier of the item
1088 -        and whose other columns are
1089 -        the attributes to align (note that the order is important!). Both must
1090 -        have the same number of columns.
1091 -
1092 -        `processings` is a dictionary of dictionaries.
1093 -        Each key is the index of an attribute (column), and each value is a
1094 -        dictionary that describes the processings to apply to that attribute.
1095 -        Each dictionary is built as follows:
1096 -
1097 -            processing = {'normalization': [f1, f2, f3],
1098 -                         'norm_params': {'arg1': arg01, 'arg2': arg02},
1099 -                         'metric': d1,
1100 -                         'metric_params': {'arg1': arg11},
1101 -                         'weighting': w,
1102 -                         'matrix_normalized': True
1103 -                        }
1104 -
1105 -            `normalization` is the list of functions called to normalize the
1106 -            given attribute (in order). Each function is called with `norm_params`
1107 -            as arguments.
1108 -
1109 -            Idem for `metric` and `metric_params`.
1110 -
1111 -            `weighting` is the weighting of the current attribute with regard
1112 -            to the others.
1113 -
1114 -            `resultfile` (default: None) names a file to write the matched elements to.
1115 -
1116 -        Return the distance matrix and the matched list.
1117 -    """
1118 -    warnings.warn(DeprecationWarning('This function will be removed in the next '
1119 -                                     'release.'
1120 -                                     ' You should rather use the BaseAligner '
1121 -                                     'object of the aligner module'))
1122 -    processings = processings or {}
1123 -    # Get the normalizers
1124 -    normalizers = []
1125 -    for ind, processing in processings.iteritems():
1126 -        for normalizer in extract_normalization_from_treatment(processing, ind):
1127 -            normalizers.append(normalizer)
1128 -    # Cleanup processings
1129 -    for t in processings.itervalues():
1130 -        if 'normalization' in t:
1131 -            t.pop('normalization')
1132 -        if 'norm_params' in t:
1133 -            t.pop('norm_params')
1134 -    # Build aligner
1135 -    processings = [extract_treatment_from_treatment(t, ind) for ind, t in processings.iteritems()]
1136 -    aligner = BaseAligner(threshold, processings)
1137 -    normalizer = NormalizerPipeline(normalizers) if normalizers else None
1138 -    aligner.register_ref_normalizer(normalizer)
1139 -    aligner.register_target_normalizer(normalizer)
1140 -    return aligner.align(alignset, targetset)
1141 -
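A minimal sketch of the legacy dict-based API handled by align() above (not part of this changeset). Records carry (lat, lon) tuples in column 1; 'geographical' is the only metric recognized by extract_treatment_from_treatment(), and the threshold is assumed to be expressed in the metric's distance unit (meters here). The import path is an assumption:

from nazca.record_linkage.old_api import align  # assumed post-move path

alignset = [('A1', (48.8567, 2.3508)),   # Paris
            ('A2', (45.7589, 4.8414))]   # Lyon
targetset = [('T1', (48.86, 2.35)),
             ('T2', (45.75, 4.85))]
processings = {1: {'metric': 'geographical', 'matrix_normalized': False}}
mat, matched = align(alignset, targetset, threshold=5000, processings=processings)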
1142 -def subalign(alignset, targetset, alignind, targetind, threshold,
1143 -             processings=None, _applyNormalization=True):
1144 -    """ Compute a subalignment for a list of indices of the alignset and
1145 -    a list of indices for the targetset """
1146 -    warnings.warn(DeprecationWarning('This function will be removed in the next '
1147 -                                     'release.'
1148 -                                     ' You should rather use the BaseAligner '
1149 -                                     'object of the aligner module'))
1150 -    mat, matched = align([alignset[i[0]] for i in alignind],
1151 -                         [targetset[i[0]] for i in targetind], threshold,
1152 -                         processings, _applyNormalization=_applyNormalization)
1153 -    new_matched = {}
1154 -    for k, values in matched.iteritems():
1155 -        new_matched[alignind[k]] = [(targetind[i], d) for i, d in values]
1156 -    return mat, new_matched
1157 -
1158 -def conquer_and_divide_alignment(alignset, targetset, threshold, processings=None,
1159 -                                 indexes=(1,1), mode='kdtree', neighbours_threshold=0.1,
1160 -                                 n_clusters=None, kwordsgram=1, siglen=200,
1161 -                                 get_global_mat=True):
1162 -    """ Full conquer and divide method for alignment.
1163 -    Compute neighbours and merge the different subalignments.
1164 -    """
1165 -    warnings.warn(DeprecationWarning('This function will be removed in the next '
1166 -                                     'release.'
1167 -                                     ' You should rather use the BaseAligner '
1168 -                                     'object of the aligner module'))
1169 -    global_matched = {}
1170 -    if get_global_mat:
1171 -        global_mat = lil_matrix((len(alignset), len(targetset)))
1172 -
1173 -    processings = processings or {}
1174 -    ralignset = normalize_set(alignset, processings)
1175 -    rtargetset = normalize_set(targetset, processings)
1176 -
1177 -    for alignind, targetind in findneighbours(ralignset, rtargetset, indexes, mode,
1178 -                                              neighbours_threshold, n_clusters,
1179 -                                              kwordsgram, siglen):
1180 -        _, matched = subalign(alignset, targetset, alignind, targetind,
1181 -                                threshold, processings, _applyNormalization=False)
1182 -        for k, values in matched.iteritems():
1183 -            subdict = global_matched.setdefault(k, set())
1184 -            for v, d in values:
1185 -                subdict.add((v, d))
1186 -                # XXX avoid issue in sparse matrix
1187 -                if get_global_mat:
1188 -                    global_mat[k[0], v[0]] = d or 10**(-10)
1189 -    if get_global_mat:
1190 -        return global_mat, global_matched
1191 -    return global_matched
1192 -
1193 -def alignall(alignset, targetset, threshold, processings=None,
1194 -             indexes=(1,1), mode='kdtree', neighbours_threshold=0.1,
1195 -             n_clusters=None, kwordsgram=1, siglen=200, uniq=False):
1196 -    warnings.warn(DeprecationWarning('This function will be removed in the next '
1197 -                                     'release.'
1198 -                                     ' You should rather use the BaseAligner '
1199 -                                     'object of the aligner module'))
1200 -    if not mode:
1201 -        _, matched = align(alignset, targetset, threshold, processings,
1202 -                           resultfile=None, _applyNormalization=True)
1203 -    else:
1204 -        matched = conquer_and_divide_alignment(alignset, targetset, threshold,
1205 -                                               processings, indexes, mode,
1206 -                                               neighbours_threshold, n_clusters,
1207 -                                               kwordsgram, siglen,
1208 -                                               get_global_mat=False)
1209 -
1210 -    if not uniq:
1211 -        for alignid in matched:
1212 -            for targetid, _ in matched[alignid]:
1213 -                yield alignset[alignid[0]][0], targetset[targetid[0]][0]
1214 -    else:
1215 -        for alignid in matched:
1216 -            bestid, _ = sorted(matched[alignid], key=lambda x:x[1])[0]
1217 -            yield alignset[alignid[0]][0], targetset[bestid[0]][0]
1218 -
1219 -def alignall_iterative(alignfile, targetfile, alignformat, targetformat,
1220 -                       threshold, size=10000, equality_threshold=0.01,
1221 -                       processings=None, indexes=(1,1), mode='kdtree',
1222 -                       neighbours_threshold=0.1, n_clusters=None, kwordsgram=1,
1223 -                       siglen=200, cache=None):
1224 -    """ This function helps you to align *huge* files.
1225 -        It takes your csv files as arguments and split them into smaller ones
1226 -        (files of `size` lines), and runs the alignment on those files.
1227 -
1228 -        `alignformat` and `targetformat` are keyworded arguments given to the
1229 -        nazca.dataio.parsefile function.
1230 -
1231 -        This function returns its own cache. The cache is quite simply a
1232 -        dictionary having align items' id as keys and tuples (target item's id,
1233 -        distance) as value. This dictionary can be regiven to this function to
1234 -        perform another alignment (with different parameters, or just to be
1235 -        sure everything has been caught)
1236 -
1237 -        If the distance of an alignment is below `equality_threshold`, the
1238 -        alignment is considered as perfect, and the corresponding item is
1239 -        removed from the alignset (to speed up the computation).
1240 -    """
1241 -    warnings.warn(DeprecationWarning('This function will be removed in the next '
1242 -                                     'release.'
1243 -                                     ' You should rather use the BaseAligner '
1244 -                                     'object of the aligner module'))
1245 -    #Split the huge files into smaller ones
1246 -    aligndir = mkdtemp()
1247 -    targetdir = mkdtemp()
1248 -    alignfiles = split_file(alignfile, aligndir, size)
1249 -    targetfiles = split_file(targetfile, targetdir, size)
1250 -
1251 -    #Compute the number of iterations that must be done to achieve the alignment
1252 -    nb_iterations = len(alignfiles) * len(targetfiles)
1253 -    current_it = 0
1254 -
1255 -    cache = cache or {} #Contains the best known alignments
1256 -    #Contains the ids of perfectly aligned data
1257 -    doneids = set(_id for _id, (_, dist) in cache.iteritems()
1258 -                          if dist < equality_threshold)
1259 -
1260 -    try:
1261 -        for alignfile in alignfiles:
1262 -            alignset = [a for a in parsefile(osp.join(aligndir, alignfile), **alignformat)
1263 -                        if a[0] not in doneids]
1264 -            for targetfile in targetfiles:
1265 -                targetset = parsefile(osp.join(targetdir, targetfile), **targetformat)
1266 -                matched = conquer_and_divide_alignment(alignset, targetset,
1267 -                                                       threshold,
1268 -                                                       processings=processings,
1269 -                                                       indexes=indexes,
1270 -                                                       mode=mode,
1271 -                                                       neighbours_threshold=neighbours_threshold,
1272 -                                                       n_clusters=n_clusters,
1273 -                                                       kwordsgram=kwordsgram,
1274 -                                                       siglen=siglen,
1275 -                                                       get_global_mat=False)
1276 -                for alignid in matched:
1277 -                    bestid, dist = sorted(matched[alignid], key=lambda x:x[1])[0]
1278 -                    #Get the best known distance
1279 -                    _, current_dist = cache.get(alignset[alignid[0]][0], (None, None))
1280 -                    if current_dist is None or current_dist > dist:
1281 -                        #If it's better, update the cache
1282 -                        cache[alignset[alignid[0]][0]] = (targetset[bestid[0]][0], dist)
1283 -                        if dist <= equality_threshold:
1284 -                            #If perfect, stop trying to align this one
1285 -                            doneids.add(alignset[alignid[0]][0])
1286 -
1287 -                current_it += 1
1288 -                sys.stdout.write('\r%0.2f%%' % (current_it * 100. /
1289 -                                                nb_iterations))
1290 -                sys.stdout.flush()
1291 -                if doneids:
1292 -                    alignset = [a for a in alignset if a[0] not in doneids]
1293 -                if not alignset: #All items have been aligned
1294 -                    #TODO Increment current_it.
1295 -                    #The progress of the alignment process is computed with
1296 -                    #`current_it`. If all items of `alignset` are aligned, we
1297 -                    #stop the alignment process for this `alignset`. If
1298 -                    #`current_it` isn’t incremented, the progress shown will be
1299 -                    #wrong.
1300 -                    break
1301 -
1302 -    finally:
1303 -        rmtree(aligndir)
1304 -        rmtree(targetdir)
1305 -
1306 -    return cache
1307 -
1308 -
1309 -
1310 -
1311 -
1312 -
1313 -
1314 -###############################################################################
1315 -### CLUSTERING-BASED BLOCKING FUNCTIONS #######################################
1316 -###############################################################################
1317 -# Backward compatibility: these functions now use the blocking objects internally.
1318 -# They may be removed in a later release...
1319 -def findneighbours_clustering(alignset, targetset, indexes=(1, 1),
1320 -                              mode='kmeans', n_clusters=None):
1321 -    """ Find the neigbhours using clustering (kmeans or minibatchkmeans)
1322 -    """
1323 -    warnings.warn(DeprecationWarning('This function will be removed in the next '
1324 -                                     'release.'
1325 -                                     ' You should rather use the KmeansBlocking '
1326 -                                     'object of the blocking module'))
1327 -    if mode == 'kmeans':
1328 -        blocking = KmeansBlocking(ref_attr_index=indexes[0],
1329 -                                  target_attr_index=indexes[1],
1330 -                                  n_clusters=n_clusters)
1331 -    elif mode == 'minibatch':
1332 -        blocking = MiniBatchKmeansBlocking(ref_attr_index=indexes[0],
1333 -                                           target_attr_index=indexes[1],
1334 -                                           n_clusters=n_clusters)
1335 -    else:
1336 -        raise ValueError("Mode should be 'kmeans' or 'minibatch'")
1337 -    # Fit blocking object
1338 -    blocking.fit(alignset, targetset)
1339 -    return list(blocking.iter_blocks())
1340 -
1341 -def findneighbours_kdtree(alignset, targetset, indexes=(1, 1), threshold=0.1):
1342 -    """ Find the neigbhours using kdree
1343 -    """
1344 -    warnings.warn(DeprecationWarning('This function will be removed in the next '
1345 -                                     'release.'
1346 -                                     ' You should rather use the KdTreeBlocking '
1347 -                                     'object of the blocking module'))
1348 -    blocking = KdTreeBlocking(ref_attr_index=indexes[0],
1349 -                              target_attr_index=indexes[1],
1350 -                              threshold=threshold)
1351 -    blocking.fit(alignset, targetset)
1352 -    return list(blocking.iter_blocks())
1353 -
1354 -def findneighbours_minhashing(alignset, targetset, indexes=(1, 1), threshold=0.1,
1355 -                              kwordsgram=1, siglen=200):
1356 -    """ Find the neigbhours using minhashing
1357 -    """
1358 -    warnings.warn(DeprecationWarning('This function will be removed in the next '
1359 -                                     'release.'
1360 -                                     ' You should rather use the '
1361 -                                     'MinHashingBlocking '
1362 -                                     'object of the blocking module'))
1363 -    blocking = MinHashingBlocking(ref_attr_index=indexes[0],
1364 -                                  target_attr_index=indexes[1],
1365 -                                  threshold=threshold, kwordsgram=kwordsgram,
1366 -                                  siglen=siglen)
1367 -    blocking.fit(alignset, targetset)
1368 -    return list(blocking.iter_blocks())
1369 -
1370 -def findneighbours(alignset, targetset, indexes=(1, 1), mode='kdtree',
1371 -                   neighbours_threshold=0.1, n_clusters=None, kwordsgram=1, siglen=200):
1372 -    """ This function helps to find neighbours from items of alignset and
1373 -        targetset. “Neighbours” are items that are “not so far”, ie having a
1374 -        close label, are located in the same area etc.
1375 -
1376 -        This function handles two types of neighbouring : text and numeric.
1377 -        For text value, you have to use the “minhashing” and for numeric, you
1378 -        can choose from “kdtree“, “kdmeans“ and “minibatch”
1379 -
1380 -        The arguments to give are :
1381 -            - `alignset` and `targetset` are the sets where neighbours have to
1382 -              be found.
1383 -            - `indexes` are the location of items to compare
1384 -            - `mode` is the search type to use
1385 -            - `neighbours_threshold` is the `mode` neighbours_threshold
1386 -
1387 -            - `n_clusters` is used for "kmeans" and "minibatch" methods, and it
1388 -              is the number of clusters to use.
1389 -
1390 -            - `kwordsgram` and `siglen` are used for "minhashing". `kwordsgram`
1391 -              is the length of wordsgrams to use, and `siglen` is the length of
1392 -              the minhashing signature matrix.
1393 -
1394 -        Returns a list of lists, built as follows:
1395 -            [
1396 -                [[indexes_of_alignset_0], [indexes_of_targetset_0]],
1397 -                [[indexes_of_alignset_1], [indexes_of_targetset_1]],
1398 -                [[indexes_of_alignset_2], [indexes_of_targetset_2]],
1399 -                [[indexes_of_alignset_3], [indexes_of_targetset_3]],
1400 -                ...
1401 -            ]
1402 -    """
1403 -    warnings.warn(DeprecationWarning('This function will be removed in the next '
1404 -                                     'release.'
1405 -                                     ' You should rather use the '
1406 -                                     'BaseBlocking '
1407 -                                     'objects of the blocking module'))
1408 -    SEARCHERS = set(['kdtree', 'minhashing', 'kmeans', 'minibatch'])
1409 -    mode = mode.lower()
1410 -
1411 -    if mode not in SEARCHERS:
1412 -        raise NotImplementedError('Unknown mode given')
1413 -    if mode == 'kdtree':
1414 -        return findneighbours_kdtree(alignset, targetset, indexes, neighbours_threshold)
1415 -    elif mode == 'minhashing':
1416 -        return findneighbours_minhashing(alignset, targetset, indexes, neighbours_threshold,
1417 -                                         kwordsgram, siglen)
1418 -    elif mode in set(['kmeans', 'minibatch']):
1419 -        try:
1420 -            return findneighbours_clustering(alignset, targetset, indexes, mode, n_clusters)
1421 -        except:
1422 -            raise NotImplementedError('Scikit learn does not seem to be installed')
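For code still calling these deprecated helpers, here is a minimal migration sketch, assuming the new module is importable as nazca.record_linkage.blocking and using toy records whose first element is an id:

    # The deprecated call
    #   findneighbours_kdtree(alignset, targetset, indexes=(1, 1), threshold=0.1)
    # becomes an explicit blocking object:
    from nazca.record_linkage.blocking import KdTreeBlocking

    alignset = [('R1', 1.0), ('R2', 5.0)]     # toy (id, numeric value) records
    targetset = [('T1', 1.05), ('T2', 5.2)]

    blocking = KdTreeBlocking(ref_attr_index=1, target_attr_index=1, threshold=0.1)
    blocking.fit(alignset, targetset)
    blocks = list(blocking.iter_blocks())     # same neighbour pairs as the old helper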
diff --git a/record_linkage/__init__.py b/record_linkage/__init__.py
diff --git a/record_linkage/aligner.py b/record_linkage/aligner.py
@@ -0,0 +1,324 @@
1423 +# -*- coding:utf-8 -*-
1424 +# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
1425 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
1426 +#
1427 +# This program is free software: you can redistribute it and/or modify it under
1428 +# the terms of the GNU Lesser General Public License as published by the Free
1429 +# Software Foundation, either version 2.1 of the License, or (at your option)
1430 +# any later version.
1431 +#
1432 +# This program is distributed in the hope that it will be useful, but WITHOUT
1433 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1434 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1435 +# details.
1436 +#
1437 +# You should have received a copy of the GNU Lesser General Public License along
1438 +# with this program. If not, see <http://www.gnu.org/licenses/>.
1439 +import time
1440 +import logging
1441 +from collections import defaultdict
1442 +
1443 +from scipy import zeros
1444 +from scipy.sparse import lil_matrix
1445 +
1446 +from nazca.dataio import parsefile
1447 +
1448 +
1449 +###############################################################################
1450 +### UTILITY FUNCTIONS #########################################################
1451 +###############################################################################
1452 +def iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique=True):
1453 +    """ Return the aligned pairs
1454 +    """
1455 +    if unique:
1456 +        for refid in global_matched:
1457 +            bestid, _ = sorted(global_matched[refid], key=lambda x:x[1])[0]
1458 +            ref_record = refset[refid]
1459 +            target_record = targetset[bestid]
1460 +            distance = global_mat[refid, bestid] if global_mat is not None else None
1461 +            yield (ref_record[0], refid), (target_record[0], bestid), distance
1462 +    else:
1463 +        for refid in global_matched:
1464 +            for targetid, _ in global_matched[refid]:
1465 +                ref_record = refset[refid]
1466 +                target_record = targetset[targetid]
1467 +                distance = global_mat[refid, targetid] if global_mat is not None else None
1468 +                yield (ref_record[0], refid), (target_record[0], targetid), distance
1469 +
1470 +
1471 +###############################################################################
1472 +### BASE ALIGNER OBJECT #######################################################
1473 +###############################################################################
1474 +class BaseAligner(object):
1475 +
1476 +    def __init__(self, threshold, processings, normalize_matrix=False):
1477 +        self.threshold = threshold
1478 +        self.processings = processings
1479 +        self.normalize_matrix = normalize_matrix
1480 +        self.ref_normalizer = None
1481 +        self.target_normalizer = None
1483 +        self.blocking = None
1484 +        self.alignments_done = 0
1485 +        self.pairs_found = 0
1486 +        self.nb_comparisons = 0
1487 +        self.nb_blocks = 0
1488 +        self.refset_size = None
1489 +        self.targetset_size = None
1490 +        self.time = None
1491 +        self.logger = logging.getLogger('nazca.aligner')
1492 +
1493 +    def register_ref_normalizer(self, normalizer):
1494 +        """ Register normalizers to be applied
1495 +        before alignment """
1496 +        self.ref_normalizer = normalizer
1497 +
1498 +    def register_target_normalizer(self, normalizer):
1499 +        """ Register normalizers to be applied
1500 +        before alignment """
1501 +        self.target_normalizer = normalizer
1502 +
1503 +    def register_blocking(self, blocking):
1504 +        self.blocking = blocking
1505 +
1506 +    def apply_normalization(self, dataset, normalizer):
1507 +        if normalizer:
1508 +            return normalizer.normalize_dataset(dataset)
1509 +        return dataset
1510 +
1511 +    def compute_distance_matrix(self, refset, targetset,
1512 +                                ref_indexes, target_indexes):
1513 +        """ Compute and return the global alignment matrix.
1514 +        For each `processing`, a distance matrix is built, then all the
1515 +        matrices are summed and the result is the global
1516 +        alignment matrix, which is returned.
1517 +        """
1518 +        distmatrix = zeros((len(ref_indexes), len(target_indexes)), dtype='float32')
1519 +        for processing in self.processings:
1520 +            distmatrix += processing.cdist(refset, targetset,
1521 +                                          ref_indexes, target_indexes)
1522 +        return distmatrix
1523 +
1524 +    def threshold_matched(self, distmatrix):
1525 +        """ Return the matched elements within a dictionnary,
1526 +        each key being the indice from X, and the corresponding
1527 +        values being a list of couple (indice from Y, distance)
1528 +        """
1529 +        match = defaultdict(list)
1530 +        if self.normalize_matrix:
1531 +            distmatrix /= distmatrix.max()
1532 +        ind = (distmatrix <= self.threshold).nonzero()
1533 +        indrow = ind[0].tolist()
1534 +        indcol = ind[1].tolist()
1535 +        for (i, j) in zip(indrow, indcol):
1536 +            match[i].append((j, distmatrix[i, j]))
1537 +        return match
1538 +
1539 +    def _get_match(self, refset, targetset, ref_indexes=None, target_indexes=None):
1542 +        ref_indexes = ref_indexes or xrange(len(refset))
1543 +        target_indexes = target_indexes or xrange(len(targetset))
1544 +        # Apply alignments
1545 +        mat = self.compute_distance_matrix(refset, targetset,
1546 +                                           ref_indexes=ref_indexes,
1547 +                                           target_indexes=target_indexes)
1548 +        matched = self.threshold_matched(mat)
1549 +        # Reapply matched to global indexes
1550 +        new_matched = {}
1551 +        for k, values in matched.iteritems():
1552 +            new_matched[ref_indexes[k]] = [(target_indexes[i], d) for i, d in values]
1553 +        return mat, new_matched
1554 +
1555 +    def align(self, refset, targetset, get_matrix=True):
1556 +        """ Perform the alignment on the referenceset
1557 +        and the targetset
1558 +        """
1559 +        start_time = time.time()
1560 +        refset = self.apply_normalization(refset, self.ref_normalizer)
1561 +        targetset = self.apply_normalization(targetset, self.target_normalizer)
1562 +        self.refset_size = len(refset)
1563 +        self.targetset_size = len(targetset)
1564 +        # If no blocking
1565 +        if not self.blocking:
1566 +            return self._get_match(refset, targetset)
1567 +        # Blocking == conquer_and_divide
1568 +        global_matched = {}
1569 +        global_mat = lil_matrix((len(refset), len(targetset)))
1570 +        self.blocking.fit(refset, targetset)
1571 +        for refblock, targetblock in self.blocking.iter_blocks():
1572 +            self.nb_blocks += 1
1573 +            ref_index = [r[0] for r in refblock]
1574 +            target_index = [r[0] for r in targetblock]
1575 +            self.nb_comparisons += len(ref_index)*len(target_index)
1576 +            _, matched = self._get_match(refset, targetset, ref_index, target_index)
1577 +            for k, values in matched.iteritems():
1578 +                subdict = global_matched.setdefault(k, set())
1579 +                for v, d in values:
1580 +                    subdict.add((v, d))
1581 +                    self.alignments_done += 1
1582 +                    if get_matrix:
1583 +                        # XXX avoid issue in sparse matrix
1584 +                        global_mat[k, v] = d or 10**(-10)
1585 +        self.time = time.time() - start_time
1586 +        return global_mat, global_matched
1587 +
1588 +    def get_aligned_pairs(self, refset, targetset, unique=True):
1589 +        """ Get the pairs of aligned elements
1590 +        """
1591 +        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
1592 +        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
1593 +            self.pairs_found += 1
1594 +            yield pair
1595 +        self.log_infos()
1596 +
1597 +    def align_from_files(self, reffile, targetfile,
1598 +                         ref_indexes=None, target_indexes=None,
1599 +                         ref_encoding=None, target_encoding=None,
1600 +                         ref_separator='\t', target_separator='\t',
1601 +                         get_matrix=True):
1602 +        """ Align data from files
1603 +
1604 +        Parameters
1605 +        ----------
1606 +
1607 +        reffile: name of the reference file
1608 +
1609 +        targetfile: name of the target file
1610 +
1611 +        ref_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
1612 +                      be used to read the reference file.
1613 +
1614 +        target_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
1615 +                         be used to read the target file.
1616 +
1617 +        ref_separator: separator of the reference file
1618 +
1619 +        target_separator: separator of the target file
1620 +        """
1621 +        refset = parsefile(reffile, indexes=ref_indexes,
1622 +                           encoding=ref_encoding, delimiter=ref_separator)
1623 +        targetset = parsefile(targetfile, indexes=target_indexes,
1624 +                              encoding=target_encoding, delimiter=target_separator)
1625 +        return self.align(refset, targetset, get_matrix=get_matrix)
1626 +
1627 +    def get_aligned_pairs_from_files(self, reffile, targetfile,
1628 +                         ref_indexes=None, target_indexes=None,
1629 +                         ref_encoding=None, target_encoding=None,
1630 +                         ref_separator='\t', target_separator='\t',
1631 +                         unique=True):
1632 +        """ Get the pairs of aligned elements
1633 +        """
1634 +        refset = parsefile(reffile, indexes=ref_indexes,
1635 +                           encoding=ref_encoding, delimiter=ref_separator)
1636 +        targetset = parsefile(targetfile, indexes=target_indexes,
1637 +                              encoding=target_encoding, delimiter=target_separator)
1638 +        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
1639 +        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
1640 +            yield pair
1641 +
1642 +    def log_infos(self):
1643 +        """ Display some info on the aligner process
1644 +        """
1645 +        self.logger.info('Computation time : %s' % self.time)
1646 +        self.logger.info('Size reference set : %s' % self.refset_size)
1647 +        self.logger.info('Size target set : %s' % self.targetset_size)
1648 +        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
1649 +        self.logger.info('Alignments done : %s' % self.alignments_done)
1650 +        self.logger.info('Pairs found : %s' % self.pairs_found)
1651 +        self.logger.info('Ratio alignments done/reference set : %s'
1652 +                         % (self.alignments_done/float(self.refset_size)))
1653 +        self.logger.info('Ratio alignments done/target set : %s'
1654 +                         % (self.alignments_done/float(self.targetset_size)))
1655 +        self.logger.info('Ratio pairs found/reference set : %s'
1656 +                         % (self.pairs_found/float(self.refset_size)))
1657 +        self.logger.info('Ratio pairs found/target set : %s'
1658 +                         % (self.pairs_found/float(self.targetset_size)))
1659 +        self.logger.info('Maximum comparisons : %s'
1660 +                         % (self.refset_size * self.targetset_size))
1661 +        self.logger.info('Number of blocks : %s' % self.nb_blocks)
1662 +        if self.nb_blocks:
1663 +            self.logger.info('Ratio comparisons/block : %s'
1664 +                             % (float(self.nb_comparisons)/self.nb_blocks))
1665 +        self.logger.info('Blocking reduction : %s'
1666 +                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
1667 +
1668 +
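As a usage sketch of the aligner above: processings are expected to expose a cdist() method as used in compute_distance_matrix(); ExactMatchProcessing below is a hypothetical stand-in (real processing classes live in nazca.distances), applied to toy (id, value) records:

    from scipy import zeros
    from nazca.record_linkage.aligner import BaseAligner

    class ExactMatchProcessing(object):
        """ Hypothetical processing: distance 0. for identical
        attribute values, 1. otherwise. """
        def __init__(self, ref_attr_index, target_attr_index):
            self.ref_attr_index = ref_attr_index
            self.target_attr_index = target_attr_index

        def cdist(self, refset, targetset, ref_indexes, target_indexes):
            mat = zeros((len(ref_indexes), len(target_indexes)), dtype='float32')
            for i, refind in enumerate(ref_indexes):
                for j, targetind in enumerate(target_indexes):
                    if (refset[refind][self.ref_attr_index]
                            != targetset[targetind][self.target_attr_index]):
                        mat[i, j] = 1.
            return mat

    refset = [('R1', 'paris'), ('R2', 'london')]
    targetset = [('T1', 'paris'), ('T2', 'berlin')]
    aligner = BaseAligner(threshold=0.1, processings=[ExactMatchProcessing(1, 1)])
    pairs = list(aligner.get_aligned_pairs(refset, targetset))
    # -> [(('R1', 0), ('T1', 0), 0.0)]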
1669 +###############################################################################
1670 +### PIPELINE ALIGNER OBJECT ##################################################
1671 +###############################################################################
1672 +class PipelineAligner(object):
1673 +    """ This pipeline will perform iterative alignments, removing each time
1674 +    the aligned results from the previous aligner.
1675 +    """
1676 +
1677 +    def __init__(self, aligners):
1678 +        self.aligners = aligners
1679 +        self.pairs = {}
1680 +        self.nb_comparisons = 0
1681 +        self.nb_blocks = 0
1682 +        self.alignments_done = 0
1683 +        self.pairs_found = 0
1684 +        self.refset_size = None
1685 +        self.targetset_size = None
1686 +        self.time = None
1687 +        self.logger = logging.getLogger('nazca.aligner')
1688 +
1689 +    def get_aligned_pairs(self, refset, targetset, unique=True):
1690 +        """ Get the pairs of aligned elements
1691 +        """
1692 +        start_time = time.time()
1693 +        ref_index = range(len(refset))
1694 +        target_index = range(len(targetset))
1695 +        self.refset_size = len(refset)
1696 +        self.targetset_size = len(targetset)
1697 +        global_matched = {}
1698 +        global_mat = lil_matrix((len(refset), len(targetset)))
1699 +        seen_refset = set()
1700 +        # Iteration over aligners
1701 +        for ind_aligner, aligner in enumerate(self.aligners):
1702 +            # Perform alignment
1703 +            _refset = [refset[i] for i in ref_index]
1704 +            _targetset = [targetset[i] for i in target_index]
1705 +            for pair in aligner.get_aligned_pairs(_refset, _targetset, unique):
1706 +                self.pairs_found += 1
1707 +                pair = ((pair[0][0], ref_index[pair[0][1]]),
1708 +                        (pair[1][0], target_index[pair[1][1]]))
1709 +                yield pair
1710 +                seen_refset.add(pair[0][1])
1711 +            # Store stats
1712 +            self.nb_blocks += aligner.nb_blocks
1713 +            self.nb_comparisons += aligner.nb_comparisons
1714 +            # Update indexes if necessary
1715 +            # For now, we remove all the reference records that are already matched
1716 +            if ind_aligner < len(self.aligners) - 1:
1717 +                # There are other aligners after this one
1718 +                ref_index = [i for i in ref_index if i not in seen_refset]
1719 +        self.time = time.time() - start_time
1720 +        self.log_infos()
1721 +
1722 +    def log_infos(self):
1723 +        """ Display some info on the aligner process
1724 +        """
1725 +        self.logger.info('Computation time : %s' % self.time)
1726 +        self.logger.info('Size reference set : %s' % self.refset_size)
1727 +        self.logger.info('Size target set : %s' % self.targetset_size)
1728 +        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
1729 +        self.logger.info('Alignments done : %s' % self.alignments_done)
1730 +        self.logger.info('Pairs found : %s' % self.pairs_found)
1731 +        self.logger.info('Ratio alignments done/reference set : %s'
1732 +                         % (self.alignments_done/float(self.refset_size)))
1733 +        self.logger.info('Ratio alignments done/target set : %s'
1734 +                         % (self.alignments_done/float(self.targetset_size)))
1735 +        self.logger.info('Ratio pairs found/reference set : %s'
1736 +                         % (self.pairs_found/float(self.refset_size)))
1737 +        self.logger.info('Ratio pairs found/target set : %s'
1738 +                         % (self.pairs_found/float(self.targetset_size)))
1739 +        self.logger.info('Maximum comparisons : %s'
1740 +                         % (self.refset_size * self.targetset_size))
1741 +        self.logger.info('Number of blocks : %s' % self.nb_blocks)
1742 +        if self.nb_blocks:
1743 +            self.logger.info('Ratio comparisons/block : %s'
1744 +                             % (float(self.nb_comparisons)/self.nb_blocks))
1745 +        self.logger.info('Blocking reduction : %s'
1746 +                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
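A short sketch of chaining aligners with PipelineAligner, reusing the hypothetical ExactMatchProcessing from the previous sketch; in practice the second aligner would carry a looser processing (e.g. an edit-distance one):

    first = BaseAligner(threshold=0.1, processings=[ExactMatchProcessing(1, 1)])
    second = BaseAligner(threshold=0.9, processings=[ExactMatchProcessing(1, 1)])
    pipeline = PipelineAligner([first, second])

    refset = [('R1', 'paris'), ('R2', 'london')]
    targetset = [('T1', 'paris'), ('T2', 'londres')]
    # 'R1'/'T1' is matched by the first aligner; only 'R2' is passed on
    # to the second one (and here still rejected)
    pairs = list(pipeline.get_aligned_pairs(refset, targetset))
    # -> [(('R1', 0), ('T1', 0))]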
diff --git a/record_linkage/blocking.py b/record_linkage/blocking.py
@@ -0,0 +1,666 @@
1747 +# -*- coding:utf-8 -*-
1748 +# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
1749 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
1750 +#
1751 +# This program is free software: you can redistribute it and/or modify it under
1752 +# the terms of the GNU Lesser General Public License as published by the Free
1753 +# Software Foundation, either version 2.1 of the License, or (at your option)
1754 +# any later version.
1755 +#
1756 +# This program is distributed in the hope that it will be useful, but WITHOUT
1757 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1758 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1759 +# details.
1760 +#
1761 +# You should have received a copy of the GNU Lesser General Public License along
1762 +# with this program. If not, see <http://www.gnu.org/licenses/>.
1763 +
1764 +
1765 +""" Blocking techniques.
1766 +
1767 +This module implements a set of blocking techniques used to split
1768 +datasets into smaller subsets that will be aligned in more detail.
1769 +
1770 +Additional information:
1771 +
1772 +   P. Christen, Data Matching, Data-Centric Systems and Applications,
1773 +   Springer, 2012.
1774 +
1775 +"""
1776 +from functools import partial
1777 +import warnings
1778 +
1779 +from scipy.spatial import KDTree
1780 +
1781 +from nazca.minhashing import Minlsh
1782 +from nazca.distances import soundexcode
1783 +
1784 +
1785 +###############################################################################
1786 +### GENERAL BLOCKING ##########################################################
1787 +###############################################################################
1788 +class BaseBlocking(object):
1789 +    """ An abstract general blocking object that exposes
1790 +    the API that should be common to all blocking objects
1791 +    """
1792 +    def __init__(self, ref_attr_index, target_attr_index):
1793 +        """ Build the blocking object
1794 +
1795 +        Parameters
1796 +        ----------
1797 +
1798 +        ref_attr_index: index of the attribute of interest in a record
1799 +                        for the reference dataset
1800 +                        (i.e. attribute to be used for key computation)
1801 +
1802 +        target_attr_index: index of the attribute of interest in a record
1803 +                           for the target dataset
1804 +                           (i.e. attribute to be used for key computation)
1805 +        """
1806 +        self.ref_attr_index = ref_attr_index
1807 +        self.target_attr_index = target_attr_index
1808 +        self.refids = None
1809 +        self.targetids = None
1810 +        self.is_fitted = False
1811 +
1812 +    def _fit(self, refset, targetset):
1813 +        raise NotImplementedError
1814 +
1815 +    def _iter_blocks(self):
1816 +        """ Internal iteration function over blocks
1817 +        """
1818 +        raise NotImplementedError
1819 +
1820 +    def _cleanup(self):
1821 +        """ Internal cleanup blocking for further use (e.g. in pipeline)
1822 +        """
1823 +        raise NotImplementedError
1824 +
1825 +    def fit(self, refset, targetset):
1826 +        """ Fit the blocking technique on the reference and target datasets
1827 +
1828 +        Parameters
1829 +        ----------
1830 +        refset: a dataset (list of records)
1831 +
1832 +        targetset: a dataset (list of records)
1833 +        """
1834 +        self._fit(refset, targetset)
1835 +        # Keep ids for blocks building
1836 +        self.refids = [(i, r[0]) for i, r in enumerate(refset)]
1837 +        self.targetids = [(i, r[0]) for i, r in enumerate(targetset)]
1838 +        self.is_fitted = True
1839 +
1840 +    def iter_blocks(self):
1841 +        """ Iterator over the different possible blocks.
1842 +
1843 +        Returns
1844 +        -------
1845 +
1846 +        (block1, block2): The blocks are always (reference_block, target_block)
1847 +                          and contain the pairs (index, id) of the records in the
1848 +                          corresponding dataset.
1849 +        """
1850 +        assert self.is_fitted
1851 +        return self._iter_blocks()
1852 +
1853 +    def iter_indice_blocks(self):
1854 +        """ Iterator over the different possible blocks.
1855 +
1856 +        Returns
1857 +        -------
1858 +
1859 +        (block1, block2): The blocks are always (reference_block, target_block)
1860 +                          and contain the indexes of the records in the
1861 +                          corresponding dataset.
1862 +        """
1863 +        assert self.is_fitted
1864 +        for block1, block2 in self._iter_blocks():
1865 +            yield [r[0] for r in block1], [r[0] for r in block2]
1866 +
1867 +    def iter_id_blocks(self):
1868 +        """ Iterator over the different possible blocks.
1869 +
1870 +        Returns
1871 +        -------
1872 +
1873 +        (block1, block2): The blocks are always (reference_block, target_block)
1874 +                          and contain the ids of the records in the
1875 +                          corresponding dataset.
1876 +        """
1877 +        assert self.is_fitted
1878 +        for block1, block2 in self._iter_blocks():
1879 +            yield [r[1] for r in block1], [r[1] for r in block2]
1880 +
1881 +    def iter_pairs(self):
1882 +        """ Iterator over the different possible pairs.
1883 +
1884 +        Returns
1885 +        -------
1886 +
1887 +        (pair1, pair2): The pairs are always ((ind_reference, id_reference),
1888 +                                              (ind_target, id_target)),
1889 +                        i.e. the index and id of the records in the corresponding datasets.
1890 +        """
1891 +        assert self.is_fitted
1892 +        for block1, block2 in self.iter_blocks():
1893 +            for val1 in block1:
1894 +                for val2 in block2:
1895 +                    yield val1, val2
1896 +
1897 +    def iter_indice_pairs(self):
1898 +        """ Iterator over the different possible pairs.
1899 +
1900 +        Returns
1901 +        -------
1902 +
1903 +        (pair1, pair2): The pairs are always (ind_reference, ind_target),
1904 +                        i.e. the indexes of the records in the corresponding datasets.
1905 +        """
1906 +        assert self.is_fitted
1907 +        for block1, block2 in self.iter_indice_blocks():
1908 +            for val1 in block1:
1909 +                for val2 in block2:
1910 +                    yield val1, val2
1911 +
1912 +    def iter_id_pairs(self):
1913 +        """ Iterator over the different possible pairs.
1914 +
1915 +        Returns
1916 +        -------
1917 +
1918 +        (pair1, pair2): The pairs are always (id_reference, id_target),
1919 +                        i.e. the ids of the records in the corresponding datasets.
1920 +        """
1921 +        assert self.is_fitted
1922 +        for block1, block2 in self.iter_id_blocks():
1923 +            for val1 in block1:
1924 +                for val2 in block2:
1925 +                    yield val1, val2
1926 +
1927 +    def cleanup(self):
1928 +        """ Cleanup blocking for further use (e.g. in pipeline)
1929 +        """
1930 +        self.is_fitted = False
1931 +        self._cleanup()
1932 +
1933 +
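The three hooks above (_fit, _iter_blocks, _cleanup) are all a concrete blocking has to implement. An illustrative subclass, not part of this patch, that blocks records on the first character of the chosen attribute:

    class FirstLetterBlocking(BaseBlocking):
        """ Toy blocking: records sharing the first letter of the
        attribute fall in the same block. """
        def __init__(self, ref_attr_index, target_attr_index):
            super(FirstLetterBlocking, self).__init__(ref_attr_index, target_attr_index)
            self.ref_index = {}
            self.target_index = {}

        def _fit(self, refset, targetset):
            for ind, rec in enumerate(refset):
                key = rec[self.ref_attr_index][:1]
                self.ref_index.setdefault(key, []).append((ind, rec[0]))
            for ind, rec in enumerate(targetset):
                key = rec[self.target_attr_index][:1]
                self.target_index.setdefault(key, []).append((ind, rec[0]))

        def _iter_blocks(self):
            for key, block1 in self.ref_index.iteritems():
                block2 = self.target_index.get(key)
                if block1 and block2:
                    yield block1, block2

        def _cleanup(self):
            self.ref_index = {}
            self.target_index = {}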
1934 +###############################################################################
1935 +### KEY BLOCKING ##############################################################
1936 +###############################################################################
1937 +class KeyBlocking(BaseBlocking):
1938 +    """ This blocking technique is based on a a blocking criteria
1939 +    (or blocking key), that will be used to divide the datasets.
1940 +
1941 +    The main idea here is:
1942 +
1943 +    1 - to create an index of f(x) for each x in the reference set.
1944 +
1945 +    2 - to create an index of f(y) for each y in the target set.
1946 +
1947 +    3 - to iterate on each distinct value of f(x) and to return
1948 +        the identifiers of the records of both sets for this value.
1949 +    """
1950 +
1951 +    def __init__(self, ref_attr_index, target_attr_index, callback, ignore_none=False):
1952 +        super(KeyBlocking, self).__init__(ref_attr_index, target_attr_index)
1953 +        self.callback = callback
1954 +        self.ignore_none = ignore_none
1955 +        self.reference_index = {}
1956 +        self.target_index = {}
1957 +
1958 +    def _fit(self, refset, targetset):
1959 +        """ Fit a dataset in an index using the callback
1960 +        """
1961 +        for ind, rec in enumerate(refset):
1962 +            key = self.callback(rec[self.ref_attr_index])
1963 +            if not key and self.ignore_none:
1964 +                continue
1965 +            self.reference_index.setdefault(key, []).append((ind, rec[0]))
1966 +        for ind, rec in enumerate(targetset):
1967 +            key = self.callback(rec[self.target_attr_index])
1968 +            if not key and self.ignore_none:
1969 +                continue
1970 +            self.target_index.setdefault(key, []).append((ind, rec[0]))
1971 +
1972 +    def _iter_blocks(self):
1973 +        """ Iterator over the different possible blocks.
1974 +
1975 +        Returns
1976 +        -------
1977 +
1978 +        (block1, block2): The blocks are always (reference_block, target_block)
1979 +                          and contain the pairs (index, id) of the records in the
1980 +                          corresponding dataset.
1981 +        """
1982 +        for key, block1 in self.reference_index.iteritems():
1983 +            block2 = self.target_index.get(key)
1984 +            if block1 and block2:
1985 +                yield (block1, block2)
1986 +
1987 +    def _cleanup(self):
1988 +        """ Cleanup blocking for further use (e.g. in pipeline)
1989 +        """
1990 +        self.reference_index = {}
1991 +        self.target_index = {}
1992 +
1993 +
1994 +class SoundexBlocking(KeyBlocking):
1995 +
1996 +    def __init__(self, ref_attr_index, target_attr_index, language='french',):
1997 +        super(SoundexBlocking, self).__init__(ref_attr_index, target_attr_index,
1998 +                                              partial(soundexcode, language=language))
1999 +
2000 +
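A toy usage sketch of the soundex variant above; records are (id, name) tuples and the exact grouping depends on the soundex implementation in nazca.distances:

    refset = [('R1', 'smith'), ('R2', 'martin')]
    targetset = [('T1', 'smyth'), ('T2', 'durand')]
    blocking = SoundexBlocking(ref_attr_index=1, target_attr_index=1)
    blocking.fit(refset, targetset)
    # each block pairs the (index, id) entries of records sharing a code,
    # e.g. ([(0, 'R1')], [(0, 'T1')]) if 'smith' and 'smyth' encode alike
    blocks = list(blocking.iter_blocks())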
2001 +###############################################################################
2002 +### NGRAM BLOCKING ############################################################
2003 +###############################################################################
2004 +class NGramBlocking(BaseBlocking):
2005 +    """ This blocking technique is based on a a n-gram key.
2006 +    """
2007 +
2008 +    def __init__(self, ref_attr_index, target_attr_index, ngram_size=2, depth=2):
2009 +        super(NGramBlocking, self).__init__(ref_attr_index, target_attr_index)
2010 +        self.ngram_size = ngram_size
2011 +        self.depth = depth
2012 +        self.reference_index = {}
2013 +        self.target_index = {}
2014 +
2015 +    def _fit_dataset(self, dataset, cur_index, attr_index):
2016 +        """ Fit a dataset
2017 +        """
2018 +        for ind, r in enumerate(dataset):
2019 +            cur_dict = cur_index
2020 +            text = r[attr_index]
2021 +            for i in range(self.depth):
2022 +                ngram = text[i*self.ngram_size:(i+1)*self.ngram_size]
2023 +                if i < self.depth - 1:
2024 +                    cur_dict = cur_dict.setdefault(ngram, {})
2025 +            cur_dict.setdefault(ngram, []).append((ind, r[0]))
2026 +
2027 +    def _fit(self, refset, targetset):
2028 +        """ Fit the two sets (reference set and target set)
2029 +        """
2030 +        self._fit_dataset(refset, self.reference_index, self.ref_attr_index)
2031 +        self._fit_dataset(targetset, self.target_index, self.target_attr_index)
2032 +
2033 +    def _iter_dict(self, ref_cur_dict, target_cur_dict):
2034 +        """ Iterative function used to create blocks from dicts
2035 +        """
2036 +        for key, sub_dict in ref_cur_dict.iteritems():
2037 +            if key in target_cur_dict:
2038 +                if isinstance(sub_dict, dict):
2039 +                    # There is another dict layer
2040 +                    for block1, block2 in self._iter_dict(sub_dict, target_cur_dict[key]):
2041 +                        yield block1, block2
2042 +                else:
2043 +                    # This is a list
2044 +                    yield sub_dict, target_cur_dict[key]
2045 +
2046 +    def _iter_blocks(self):
2047 +        """ Iterator over the different possible blocks.
2048 +
2049 +        Returns
2050 +        -------
2051 +
2052 +        (block1, block2): The blocks are always (reference_block, target_block)
2053 +                          and contain the pairs (index, id) of the records in the
2054 +                          corresponding dataset.
2055 +        """
2056 +        for block1, block2 in self._iter_dict(self.reference_index, self.target_index):
2057 +            if block1 and block2:
2058 +                yield block1, block2
2059 +
2060 +    def _cleanup(self):
2061 +        """ Cleanup blocking for further use (e.g. in pipeline)
2062 +        """
2063 +        self.reference_index = {}
2064 +        self.target_index = {}
2065 +
2066 +
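For instance, with ngram_size=2 and depth=2, the value 'paris' is indexed under reference_index['pa']['ri'], so only records sharing their two leading bigrams end up in a common block. A minimal sketch:

    blocking = NGramBlocking(ref_attr_index=1, target_attr_index=1,
                             ngram_size=2, depth=2)
    blocking.fit([('R1', 'paris'), ('R2', 'parma')], [('T1', 'paris')])
    blocks = list(blocking.iter_blocks())
    # -> [([(0, 'R1')], [(0, 'T1')])]; 'parma' diverges on its second bigram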
2067 +###############################################################################
2068 +### SORTKEY BLOCKING ##########################################################
2069 +###############################################################################
2070 +class SortedNeighborhoodBlocking(BaseBlocking):
2071 +    """ This blocking technique is based on a a sorting blocking criteria
2072 +    (or blocking key), that will be used to divide the datasets.
2073 +    """
2074 +
2075 +    def __init__(self, ref_attr_index, target_attr_index, key_func=lambda x: x, window_width=20):
2076 +        super(SortedNeighborhoodBlocking, self).__init__(ref_attr_index, target_attr_index)
2077 +        self.key_func = key_func
2078 +        self.window_width = window_width
2079 +        self.sorted_dataset = None
2080 +
2081 +    def _fit(self, refset, targetset):
2082 +        """ Fit a dataset in an index using the callback
2083 +        """
2084 +        self.sorted_dataset = [((ind, r[0]), r[self.ref_attr_index], 0)
2085 +                               for ind, r in enumerate(refset)]
2086 +        self.sorted_dataset.extend([((ind, r[0]), r[self.target_attr_index], 1)
2087 +                                    for ind, r in enumerate(targetset)])
2088 +        self.sorted_dataset.sort(key=lambda x: self.key_func(x[1]))
2089 +
2090 +    def _iter_blocks(self):
2091 +        """ Iterator over the different possible blocks.
2092 +        """
2093 +        for ind, (rid, record, dset) in enumerate(self.sorted_dataset):
2094 +            # Only keep reference set record
2095 +            if dset == 1:
2096 +                continue
2097 +            block1 = [rid,]
2098 +            minind = (ind - self.window_width)
2099 +            minind = max(minind, 0)
2100 +            maxind = (ind + self.window_width + 1)
2101 +            block2 = [ri for ri, re, d in self.sorted_dataset[minind:maxind]
2102 +                      if d == 1]
2103 +            if block1 and block2:
2104 +                yield (block1, block2)
2105 +
2106 +    def _cleanup(self):
2107 +        """ Cleanup blocking for further use (e.g. in pipeline)
2108 +        """
2109 +        self.sorted_dataset = None
2110 +
2111 +
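A minimal sketch: records of both datasets are sorted together on key_func, and each reference record is blocked with the target records falling inside the window of width window_width around it:

    blocking = SortedNeighborhoodBlocking(ref_attr_index=1, target_attr_index=1,
                                          window_width=1)
    blocking.fit([('R1', 'abc'), ('R2', 'xyz')], [('T1', 'abd')])
    blocks = list(blocking.iter_blocks())
    # 'abd' is adjacent to both reference records in the sorted list,
    # so here it appears in the window of each of them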
2112 +###############################################################################
2113 +### MERGE BLOCKING ############################################################
2114 +###############################################################################
2115 +class MergeBlocking(BaseBlocking):
2116 +    """ This blocking technique keep only one appearance of one given values,
2117 +    and removes all the other records having this value.
2118 +    The merge is based on a score function
2119 +
2120 +    E.g.
2121 +      ('http://fr.wikipedia.org/wiki/Paris_%28Texas%29', 'Paris', 25898)
2122 +      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)
2123 +
2124 +    could be (with a score function based on the population (third value):
2125 +
2126 +      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)
2127 +
2128 +    !!! WARNING !!! This is only done on ONE set (the one with a non null attr index)
2129 +    """
2130 +
2131 +    def __init__(self, ref_attr_index, target_attr_index, score_func):
2132 +        super(MergeBlocking, self).__init__(ref_attr_index, target_attr_index)
2133 +        self.score_func = score_func
2134 +        self.merged_dataset = None
2135 +        self.other_dataset = None
2136 +        if ref_attr_index is None and target_attr_index is None:
2137 +            raise ValueError('At least one of ref_attr_index or target_attr_index '
2138 +                             'should not be None')
2139 +
2140 +    def _fit(self, refset, targetset):
2141 +        """ Fit a dataset in an index using the callback
2142 +        """
2143 +        if self.ref_attr_index is not None:
2144 +            # Merge refset
2145 +            self.merged_dataset = self._merge_dataset(refset, self.ref_attr_index)
2146 +            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(targetset)]
2147 +        else:
2148 +            # Merge targetset
2149 +            self.merged_dataset = self._merge_dataset(targetset, self.target_attr_index)
2150 +            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(refset)]
2151 +
2152 +    def _merge_dataset(self, dataset, attr_index):
2153 +        """ Merge a dataset
2154 +        """
2155 +        merged_dataset_dict = {}
2156 +        for ind, record in enumerate(dataset):
2157 +            score = self.score_func(record)
2158 +            if record[attr_index] not in merged_dataset_dict:
2159 +                # Create new entry
2160 +                merged_dataset_dict[record[attr_index]] = (ind, record, score)
2161 +            elif (record[attr_index] in merged_dataset_dict
2162 +                  and merged_dataset_dict[record[attr_index]][2] < score):
2163 +                # Change current score
2164 +                merged_dataset_dict[record[attr_index]] = (ind, record, score)
2165 +        return [(ind, r[0]) for ind, r, score in merged_dataset_dict.itervalues()]
2166 +
2167 +    def _iter_blocks(self):
2168 +        """ Iterator over the different possible blocks.
2169 +        """
2170 +        if self.ref_attr_index is not None:
2171 +            yield self.merged_dataset, self.other_dataset
2172 +        else:
2173 +            # self.target_attr_index is not None
2174 +            yield self.other_dataset, self.merged_dataset
2175 +
2176 +    def _cleanup(self):
2177 +        """ Cleanup blocking for further use (e.g. in pipeline)
2178 +        """
2179 +        self.merged_dataset = None
2180 +        self.other_dataset = None
2181 +
2182 +
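Mirroring the docstring example above, a sketch that keeps, for each city label, only the most populated reference record before blocking:

    refset = [('http://fr.wikipedia.org/wiki/Paris_%28Texas%29', 'Paris', 25898),
              ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)]
    targetset = [('T1', 'Paris', 0)]
    blocking = MergeBlocking(ref_attr_index=1, target_attr_index=None,
                             score_func=lambda r: r[2])
    blocking.fit(refset, targetset)
    blocks = list(blocking.iter_blocks())
    # the reference block only keeps the most populated 'Paris' record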
2183 +###############################################################################
2184 +### CLUSTERING-BASED BLOCKINGS ################################################
2185 +###############################################################################
2186 +class KmeansBlocking(BaseBlocking):
2187 +    """ A blocking technique based on Kmeans
2188 +    """
2189 +
2190 +    def __init__(self, ref_attr_index, target_attr_index, n_clusters=None):
2191 +        super(KmeansBlocking, self).__init__(ref_attr_index, target_attr_index)
2192 +        self.n_clusters = n_clusters
2193 +        self.kmeans = None
2194 +        self.predicted = None
2195 +        from sklearn import cluster
2196 +        self.cluster_class = cluster.KMeans
2197 +
2198 +    def _fit(self, refset, targetset):
2199 +        """ Fit the reference dataset.
2200 +        """
2201 +        # If an element is None (missing), use instead the identity element.
2202 +        # The identity element is defined as the 0-vector
2203 +        idelement = tuple([0 for _ in xrange(len(refset[0][self.ref_attr_index]))])
2204 +        # We assume here that there are at least 2 elements in the refset
2205 +        n_clusters = self.n_clusters or (len(refset)/10 or len(refset)/2)
2206 +        kmeans = self.cluster_class(n_clusters=n_clusters)
2207 +        kmeans.fit([elt[self.ref_attr_index] or idelement for elt in refset])
2208 +        self.kmeans = kmeans
2209 +        # Predict on targetset
2210 +        self.predicted = self.kmeans.predict([elt[self.target_attr_index]
2211 +                                              or idelement for elt in targetset])
2212 +
2213 +    def _iter_blocks(self):
2214 +        """ Iterator over the different possible blocks.
2215 +
2216 +        Returns
2217 +        -------
2218 +
2219 +        (block1, block2): The blocks are always (reference_block, target_block)
2220 +                          and contain the pairs (index, id) of the records in the
2221 +                          corresponding dataset.
2222 +        """
2223 +        neighbours = [[[], []] for _ in xrange(self.kmeans.n_clusters)]
2224 +        for ind, li in enumerate(self.predicted):
2225 +            neighbours[li][1].append(self.targetids[ind])
2226 +        for ind, li in enumerate(self.kmeans.labels_):
2227 +            neighbours[li][0].append(self.refids[ind])
2228 +        for block1, block2 in neighbours:
2229 +            if len(block1) and len(block2):
2230 +                yield block1, block2
2231 +
2232 +    def _cleanup(self):
2233 +        """ Cleanup blocking for further use (e.g. in pipeline)
2234 +        """
2235 +        self.kmeans = None
2236 +        self.predicted = None
2237 +
2238 +
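A usage sketch (requires scikit-learn); the blocked attribute is assumed to be a numeric vector, here toy 2D coordinates, and n_clusters is forced to 2 for this tiny set:

    refset = [('R1', (0., 0.)), ('R2', (0.1, 0.)), ('R3', (10., 10.))]
    targetset = [('T1', (0., 0.1)), ('T2', (9.9, 10.))]
    blocking = KmeansBlocking(ref_attr_index=1, target_attr_index=1, n_clusters=2)
    blocking.fit(refset, targetset)
    blocks = list(blocking.iter_blocks())
    # records around the origin and records around (10, 10) fall in separate blocks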
2239 +###############################################################################
2240 +### KDTREE BLOCKINGS ##########################################################
2241 +###############################################################################
2242 +class KdTreeBlocking(BaseBlocking):
2243 +    """ A blocking technique based on KdTree
2244 +    """
2245 +    def __init__(self, ref_attr_index, target_attr_index, threshold=0.1):
2246 +        super(KdTreeBlocking, self).__init__(ref_attr_index, target_attr_index)
2247 +        self.threshold = threshold
2248 +        self.reftree = None
2249 +        self.targettree = None
2250 +        self.nb_elements = None
2251 +
2252 +    def _fit(self, refset, targetset):
2253 +        """ Fit the blocking
2254 +        """
2255 +        firstelement = refset[0][self.ref_attr_index]
2256 +        self.nb_elements = len(refset)
2257 +        idsize = len(firstelement) if isinstance(firstelement, (tuple, list)) else 1
2258 +        idelement = (0,) * idsize
2259 +        # KDTree is expecting a two-dimensional array
2260 +        if idsize == 1:
2261 +            self.reftree = KDTree([(elt[self.ref_attr_index] or 0,) for elt in refset])
2262 +            self.targettree = KDTree([(elt[self.target_attr_index] or 0,) for elt in targetset])
2263 +        else:
2264 +            self.reftree = KDTree([elt[self.ref_attr_index] or idelement for elt in refset])
2265 +            self.targettree = KDTree([elt[self.target_attr_index] or idelement for elt in targetset])
2266 +
2267 +    def _iter_blocks(self):
2268 +        """ Iterator over the different possible blocks.
2269 +
2270 +        Returns
2271 +        -------
2272 +
2273 +        (block1, block2): The blocks are always (reference_block, target_block)
2274 +                          and contain the pairs (index, id) of the records in the
2275 +                          corresponding dataset.
2276 +        """
2277 +        extraneighbours = self.reftree.query_ball_tree(self.targettree, self.threshold)
2278 +        neighbours = []
2279 +        for ind in xrange(self.nb_elements):
2280 +            if not extraneighbours[ind]:
2281 +                continue
2282 +            _ref = [self.refids[ind],]
2283 +            _target = [self.targetids[v] for v in extraneighbours[ind]]
2284 +            neighbours.append((_ref, _target))
2285 +        for block1, block2 in neighbours:
2286 +            if len(block1) and len(block2):
2287 +                yield block1, block2
2288 +
2289 +    def _cleanup(self):
2290 +        """ Cleanup blocking for further use (e.g. in pipeline)
2291 +        """
2292 +        self.reftree = None
2293 +        self.targettree = None
2294 +        self.nb_elements = None
2295 +
2296 +
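A sketch with a scalar numeric attribute; each reference record is blocked with the targets lying within `threshold` of it in the tree:

    blocking = KdTreeBlocking(ref_attr_index=1, target_attr_index=1, threshold=0.5)
    blocking.fit([('R1', 2.0), ('R2', 7.0)], [('T1', 2.3), ('T2', 8.5)])
    blocks = list(blocking.iter_blocks())
    # -> [([(0, 'R1')], [(0, 'T1')])]; 8.5 lies outside the 0.5 ball around 7.0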
2297 +###############################################################################
2298 +### MINHASHING BLOCKINGS ######################################################
2299 +###############################################################################
2300 +class MinHashingBlocking(BaseBlocking):
2301 +    """ A blocking technique based on MinHashing
2302 +    """
2303 +    def __init__(self, ref_attr_index, target_attr_index,
2304 +                 threshold=0.1, kwordsgram=1, siglen=200):
2305 +        super(MinHashingBlocking, self).__init__(ref_attr_index, target_attr_index)
2306 +        self.threshold = threshold
2307 +        self.kwordsgram = kwordsgram
2308 +        self.siglen = siglen
2309 +        self.minhasher = Minlsh()
2310 +        self.nb_elements = None
2311 +
2312 +    def _fit(self, refset, targetset):
2313 +        """ Find the blocking using minhashing
2314 +        """
2315 +        # If an element is None (missing), use instead the identity element.
2316 +        idelement = ''
2317 +        self.minhasher.train([elt[self.ref_attr_index] or idelement for elt in refset] +
2318 +                        [elt[self.target_attr_index] or idelement for elt in targetset],
2319 +                        self.kwordsgram, self.siglen)
2320 +        self.nb_elements = len(refset)
2321 +
2322 +    def _iter_blocks(self):
2323 +        """ Iterator over the different possible blocks.
2324 +
2325 +        Returns
2326 +        -------
2327 +
2328 +        (block1, block2): The blocks are always (reference_block, target_block)
2329 +                          and contain the indexes of the records in the
2330 +                          corresponding dataset.
2331 +        """
2332 +        rawneighbours = self.minhasher.predict(self.threshold)
2333 +        neighbours = []
2334 +        for data in rawneighbours:
2335 +            neighbours.append([[], []])
2336 +            for i in data:
2337 +                if i >= self.nb_elements:
2338 +                    neighbours[-1][1].append(self.targetids[i - self.nb_elements])
2339 +                else:
2340 +                    neighbours[-1][0].append(self.refids[i])
2341 +            if len(neighbours[-1][0]) == 0 or len(neighbours[-1][1]) == 0:
2342 +                neighbours.pop()
2343 +        for block1, block2 in neighbours:
2344 +            if len(block1) and len(block2):
2345 +                yield block1, block2
2346 +
2347 +    def _cleanup(self):
2348 +        """ Cleanup blocking for further use (e.g. in pipeline)
2349 +        """
2350 +        self.minhasher = Minlsh()
2351 +        self.nb_elements = None
2352 +
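# A usage sketch for the minhashing blocking above, mirroring
# findneighbours_minhashing() in old_api.py below (data values are
# illustrative). Column 1 holds the text whose k-wordsgrams are minhashed:
#
#     from nazca.record_linkage.blocking import MinHashingBlocking
#
#     refset = [('R1', u'victor hugo'), ('R2', u'albert camus')]
#     targetset = [('T1', u'victor hugo')]
#     blocking = MinHashingBlocking(ref_attr_index=1, target_attr_index=1,
#                                   threshold=0.4, kwordsgram=1, siglen=200)
#     blocking.fit(refset, targetset)
#     blocks = list(blocking.iter_blocks())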
2353 +
2354 +###############################################################################
2355 +### BLOCKING PIPELINE #########################################################
2356 +###############################################################################
2357 +class PipelineBlocking(BaseBlocking):
2358 +    """ Pipeline multiple blocking techniques
2359 +    """
2360 +
2361 +    def __init__(self, blockings, collect_stats=False):
2362 +        """ Build the blocking object
2363 +
2364 +        Parameters
2365 +        ----------
2366 +
2367 +        blockings: ordered list of blocking objects
2368 +        collect_stats: if True, record per-step block sizes in `stats`
2368 +        """
2369 +        self.blockings = blockings
2370 +        self.stored_blocks = []
2371 +        self.collect_stats = collect_stats
2372 +        self.stats = {}
2373 +
2374 +    def _fit(self, refset, targetset):
2375 +        """ Internal fit of the pipeline """
2376 +        self._recursive_fit(refset, targetset, range(len(refset)), range(len(targetset)), 0)
2377 +
2378 +    def _recursive_fit(self, refset, targetset, ref_index, target_index, ind):
2379 +        """ Recursive fit of the blockings.
2380 +        Blocks are stored in the stored_blocks attribute.
2381 +        """
2382 +        if ind < len(self.blockings) - 1:
2383 +            # There are other blockings after this one
2384 +            blocking = self.blockings[ind]
2385 +            blocking.cleanup()
2386 +            blocking.fit([refset[i] for i in ref_index],
2387 +                         [targetset[i] for i in target_index])
2388 +            for block1, block2 in blocking.iter_indice_blocks():
2389 +                ind_block1 = [ref_index[i] for i in block1]
2390 +                ind_block2 = [target_index[i] for i in block2]
2391 +                if self.collect_stats:
2392 +                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
2393 +                self._recursive_fit(refset, targetset, ind_block1, ind_block2, ind+1)
2394 +        else:
2395 +            # This is the final blocking
2396 +            blocking = self.blockings[ind]
2397 +            blocking.cleanup()
2398 +            blocking.fit([refset[i] for i in ref_index],
2399 +                         [targetset[i] for i in target_index])
2400 +            for block1, block2 in blocking.iter_blocks():
2401 +                ind_block1 = [(ref_index[i], _id) for i, _id in block1]
2402 +                ind_block2 = [(target_index[i], _id) for i, _id in block2]
2403 +                if self.collect_stats:
2404 +                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
2405 +                self.stored_blocks.append((ind_block1, ind_block2))
2406 +
2407 +    def _iter_blocks(self):
2408 +        """ Internal iteration function over blocks
2409 +        """
2410 +        for block1, block2 in self.stored_blocks:
2411 +            if block1 and block2:
2412 +                yield block1, block2
diff --git a/record_linkage/old_api.py b/record_linkage/old_api.py
@@ -0,0 +1,432 @@
2413 +# -*- coding:utf-8 -*-
2414 +#
2415 +# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
2416 +# contact http://www.logilab.fr -- mailto:contact@logilab.fr
2417 +#
2418 +# This program is free software: you can redistribute it and/or modify it under
2419 +# the terms of the GNU Lesser General Public License as published by the Free
2420 +# Software Foundation, either version 2.1 of the License, or (at your option)
2421 +# any later version.
2422 +#
2423 +# This program is distributed in the hope that it will be useful, but WITHOUT
2424 +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
2425 +# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
2426 +# details.
2427 +#
2428 +# You should have received a copy of the GNU Lesser General Public License along
2429 +# with this program. If not, see <http://www.gnu.org/licenses/>.
2430 +
2431 +from os import listdir
2432 +import os.path as osp
2433 +from shutil import rmtree
2434 +from tempfile import mkdtemp
2435 +import sys
2436 +import warnings
2437 +from functools import partial
2438 +
2439 +from scipy.sparse import lil_matrix
2440 +
2441 +from nazca.dataio import write_results, split_file, parsefile
2442 +from nazca.normalize import BaseNormalizer, NormalizerPipeline
2443 +from nazca.distances import GeographicalProcessing
2444 +from nazca.record_linkage.aligner import BaseAligner
2445 +from nazca.record_linkage.blocking import (KmeansBlocking, MiniBatchKmeansBlocking,
2445 +                                           KdTreeBlocking, MinHashingBlocking)
2446 +
2447 +
2448 +# Backward compatibility. Now, use the BaseAligner inside the functions.
2449 +# These functions may be removed in a later release.
2450 +
2451 +
2452 +###############################################################################
2453 +### NORMALIZE FUNCTIONS #######################################################
2454 +###############################################################################
2455 +# Backward compatibility. Now, use the NormalizerPipeline inside the functions.
2456 +# These functions may be removed in a later release.
2457 +
2458 +def normalize_set(rset, processings):
2459 +    """ Apply all the normalization functions to the given rset """
2460 +    warnings.warn(DeprecationWarning('This function will be removed '
2461 +                                     'in the next release. '
2462 +                                     'You should rather use the BaseNormalizer '
2463 +                                     'object of the normalize module'))
2464 +    normalizers = []
2465 +    for ind, processing in processings.iteritems():
2466 +        for normalizer in extract_normalization_from_treatment(processing, ind):
2467 +            normalizers.append(normalizer)
2468 +    # Create pipeline
2469 +    pipeline = NormalizerPipeline(normalizers)
2470 +    return pipeline.normalize_dataset(rset)
2471 +
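# A sketch of the legacy processings dict accepted by normalize_set(); the
# structure follows the align() docstring below, and simplify is the
# normalizer the test suite imports from nazca.normalize:
#
#     from nazca.normalize import simplify
#
#     rset = [('R1', u'The Nazca lines, Peru')]
#     processings = {1: {'normalization': [simplify]}}
#     normalized = normalize_set(rset, processings)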
2472 +def extract_normalization_from_treatment(processing, ind):
2473 +    """ Extract normalization from processing.
2474 +    This function is used for backward compatibility with
2475 +    the old function-based API """
2476 +    warnings.warn(DeprecationWarning('This function will be removed '
2477 +                                     'in the next release. '
2478 +                                     'You should rather use the BaseNormalizer '
2479 +                                     'object of the normalize module'))
2480 +    for f in processing.get('normalization', []):
2481 +        farg = f.func_code.co_varnames[:f.func_code.co_argcount] # arguments of f
2482 +        # Intersection of the arguments accepted by f and the provided
2483 +        # parameters
2484 +        givenargs = dict((arg, processing['norm_params'][arg])
2485 +                         for arg in farg if arg in processing.get('norm_params', {}))
2486 +        callback = f
2487 +        if givenargs:
2488 +            callback = partial(callback, **givenargs)
2489 +        yield BaseNormalizer(callback=callback, attr_index=ind)
2490 +
2491 +def extract_treatment_from_treatment(processing, ind):
2492 +    """ Extract Treatment object from processing dict.
2493 +    This is only for backward compatibility with the old API.
2494 +    """
2495 +    if processing['metric'] == 'geographical':
2496 +        return GeographicalProcessing(ind, ind,
2497 +                                      matrix_normalized=processing.get('matrix_normalized', False),
2498 +                                      **processing.get('metric_params', {}))
2499 +
2500 +
2501 +###############################################################################
2502 +### ALIGNER ###################################################################
2503 +###############################################################################
2504 +def align(alignset, targetset, threshold, processings=None, resultfile=None,
2505 +          _applyNormalization=True):
2506 +    """ Try to align the items of alignset onto targetset's ones
2507 +
2508 +        `alignset` and `targetset` are the sets to align. Each set contains
2509 +        lists whose first column is the identifier of the item and whose
2510 +        remaining columns are the attributes to align (note that the
2511 +        order is important!). Both sets must have the same number of
2512 +        columns.
2513 +
2514 +        `processings` is a dictionary of dictionaries.
2515 +        Each key is the index of an attribute column, and each value is a
2516 +        dictionary describing the processings to apply to that attribute.
2517 +        Each dictionary is built as follows:
2518 +
2519 +            processing = {'normalization': [f1, f2, f3],
2520 +                          'norm_params': {'arg1': arg01, 'arg2': arg02},
2521 +                          'metric': d1,
2522 +                          'metric_params': {'arg1': arg11},
2523 +                          'weighting': w,
2524 +                          'matrix_normalized': True
2525 +                         }
2526 +
2527 +            `normalization` is the list of functions called (in order) to
2528 +            normalize the given attribute. Each function is called with
2529 +            `norm_params` as keyword arguments.
2530 +
2531 +            The same goes for `metric` and `metric_params`.
2532 +
2533 +            `weighting` is the weight of the current attribute with regard
2534 +            to the others.
2535 +
2536 +        `resultfile` (default: None): write the matched elements to a file.
2537 +
2538 +        Return the distance matrix and the matched list.
2539 +    """
2540 +    warnings.warn(DeprecationWarning('This function will be removed in the next '
2541 +                                     'release.'
2542 +                                     ' You should rather use the BaseAligner '
2543 +                                     'object of the aligner module'))
2544 +    processings = processings or {}
2545 +    # Get the normalizers
2546 +    normalizers = []
2547 +    for ind, processing in processings.iteritems():
2548 +        for normalizer in extract_normalization_from_treatment(processing, ind):
2549 +            normalizers.append(normalizer)
2550 +    # Cleanup processings
2551 +    for t in processings.itervalues():
2552 +        if 'normalization' in t:
2553 +            t.pop('normalization')
2554 +        if 'norm_params' in t:
2555 +            t.pop('norm_params')
2556 +    # Build aligner
2557 +    processings = [extract_treatment_from_treatment(t, ind) for ind, t in processings.iteritems()]
2558 +    aligner = BaseAligner(threshold, processings)
2559 +    aligner.register_ref_normalizer(normalizers)
2560 +    aligner.register_target_normalizer(normalizers)
2561 +    # Align
2562 +    return aligner.align(alignset, targetset)
2563 +
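# A sketch of a call through the deprecated align() API; 'geographical' is
# the only metric handled by extract_treatment_from_treatment() above, and
# the coordinates and threshold are illustrative:
#
#     alignset = [['A1', (48.8567, 2.3508)]]
#     targetset = [['T1', (48.8566, 2.3510)]]
#     processings = {1: {'metric': 'geographical'}}
#     mat, matched = align(alignset, targetset, threshold=100,
#                          processings=processings)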
2564 +def subalign(alignset, targetset, alignind, targetind, threshold,
2565 +             processings=None, _applyNormalization=True):
2566 +    """ Compute a subalignment for a list of indices of the alignset and
2567 +    a list of indices for the targetset """
2568 +    warnings.warn(DeprecationWarning('This function will be removed in the next '
2569 +                                     'release.'
2570 +                                     ' You should rather use the BaseAligner '
2571 +                                     'object of the aligner module'))
2572 +    mat, matched = align([alignset[i[0]] for i in alignind],
2573 +                         [targetset[i[0]] for i in targetind], threshold,
2574 +                         processings, _applyNormalization=_applyNormalization)
2575 +    new_matched = {}
2576 +    for k, values in matched.iteritems():
2577 +        new_matched[alignind[k]] = [(targetind[i], d) for i, d in values]
2578 +    return mat, new_matched
2579 +
2580 +def conquer_and_divide_alignment(alignset, targetset, threshold, processings=None,
2581 +                                 indexes=(1,1), mode='kdtree', neighbours_threshold=0.1,
2582 +                                 n_clusters=None, kwordsgram=1, siglen=200,
2583 +                                 get_global_mat=True):
2584 +    """ Full divide-and-conquer method for alignment.
2585 +    Compute neighbours and merge the different subalignments.
2586 +    """
2587 +    warnings.warn(DeprecationWarning('This function will be removed in the next '
2588 +                                     'release.'
2589 +                                     ' You should rather use the BaseAligner '
2590 +                                     'object of the aligner module'))
2591 +    global_matched = {}
2592 +    if get_global_mat:
2593 +        global_mat = lil_matrix((len(alignset), len(targetset)))
2594 +
2595 +    processings = processings or {}
2596 +    ralignset = normalize_set(alignset, processings)
2597 +    rtargetset = normalize_set(targetset, processings)
2598 +
2599 +    for alignind, targetind in findneighbours(ralignset, rtargetset, indexes, mode,
2600 +                                              neighbours_threshold, n_clusters,
2601 +                                              kwordsgram, siglen):
2602 +        _, matched = subalign(alignset, targetset, alignind, targetind,
2603 +                                threshold, processings, _applyNormalization=False)
2604 +        for k, values in matched.iteritems():
2605 +            subdict = global_matched.setdefault(k, set())
2606 +            for v, d in values:
2607 +                subdict.add((v, d))
2608 +                # XXX a zero distance would be dropped by the sparse
2609 +                # matrix, so store a tiny value instead
2609 +                if get_global_mat:
2610 +                    global_mat[k[0], v[0]] = d or 10**(-10)
2611 +    if get_global_mat:
2612 +        return global_mat, global_matched
2613 +    return global_matched
2614 +
2615 +def alignall(alignset, targetset, threshold, processings=None,
2616 +             indexes=(1,1), mode='kdtree', neighbours_threshold=0.1,
2617 +             n_clusters=None, kwordsgram=1, siglen=200, uniq=False):
2618 +    warnings.warn(DeprecationWarning('This function will be removed in the next '
2619 +                                     'release.'
2620 +                                     ' You should rather use the BaseAligner '
2621 +                                     'object of the aligner module'))
2622 +    if not mode:
2623 +        _, matched = align(alignset, targetset, threshold, processings,
2624 +                           resultfile=None, _applyNormalization=True)
2625 +    else:
2626 +        matched = conquer_and_divide_alignment(alignset, targetset, threshold,
2627 +                                               processings, indexes, mode,
2628 +                                               neighbours_threshold, n_clusters,
2629 +                                               kwordsgram, siglen,
2630 +                                               get_global_mat=False)
2631 +
2632 +    if not uniq:
2633 +        for alignid in matched:
2634 +            for targetid, _ in matched[alignid]:
2635 +                yield alignset[alignid[0]][0], targetset[targetid[0]][0]
2636 +    else:
2637 +        for alignid in matched:
2638 +            bestid, _ = sorted(matched[alignid], key=lambda x:x[1])[0]
2639 +            yield alignset[alignid[0]][0], targetset[bestid[0]][0]
2640 +
2641 +def alignall_iterative(alignfile, targetfile, alignformat, targetformat,
2642 +                       threshold, size=10000, equality_threshold=0.01,
2643 +                       processings=None, indexes=(1,1), mode='kdtree',
2644 +                       neighbours_threshold=0.1, n_clusters=None, kwordsgram=1,
2645 +                       siglen=200, cache=None):
2646 +    """ This function helps you to align *huge* files.
2647 +        It takes your csv files as arguments, splits them into smaller ones
2648 +        (files of `size` lines), and runs the alignment on those files.
2649 +
2650 +        `alignformat` and `targetformat` are keyword arguments given to the
2651 +        nazca.dataio.parsefile function.
2652 +
2653 +        This function returns its own cache. The cache is simply a
2654 +        dictionary mapping each align item's id to a (target item's id,
2655 +        distance) tuple. This dictionary can be passed back to this function
2656 +        to perform another alignment (with different parameters, or just to
2657 +        make sure nothing has been missed).
2658 +
2659 +        If the distance of an alignment is below `equality_threshold`, the
2660 +        alignment is considered perfect, and the corresponding item is
2661 +        removed from the alignset (to speed up the computation).
2662 +    """
2663 +    warnings.warn(DeprecationWarning('This function will be removed in the next '
2664 +                                     'release.'
2665 +                                     ' You should rather use the BaseAligner '
2666 +                                     'object of the aligner module'))
2667 +    #Split the huge files into smaller ones
2668 +    aligndir = mkdtemp()
2669 +    targetdir = mkdtemp()
2670 +    alignfiles = split_file(alignfile, aligndir, size)
2671 +    targetfiles = split_file(targetfile, targetdir, size)
2672 +
2673 +    #Compute the number of iterations needed to complete the alignment
2674 +    nb_iterations = len(alignfiles) * len(targetfiles)
2675 +    current_it = 0
2676 +
2677 +    cache = cache or {} #Contains the best known alignments
2678 +    #Contains the ids of perfectly aligned data
2679 +    doneids = set(_id for _id, (_, dist) in cache.iteritems()
2680 +                  if dist <= equality_threshold)
2681 +
2682 +    try:
2683 +        for alignfile in alignfiles:
2684 +            alignset = [a for a in parsefile(osp.join(aligndir, alignfile), **alignformat)
2685 +                        if a[0] not in doneids]
2686 +            for targetfile in targetfiles:
2687 +                targetset = parsefile(osp.join(targetdir, targetfile), **targetformat)
2688 +                matched = conquer_and_divide_alignment(alignset, targetset,
2689 +                                                       threshold,
2690 +                                                       processings=processings,
2691 +                                                       indexes=indexes,
2692 +                                                       mode=mode,
2693 +                                                       neighbours_threshold=neighbours_threshold,
2694 +                                                       n_clusters=n_clusters,
2695 +                                                       kwordsgram=kwordsgram,
2696 +                                                       siglen=siglen,
2697 +                                                       get_global_mat=False)
2698 +                for alignid in matched:
2699 +                    bestid, dist = sorted(matched[alignid], key=lambda x:x[1])[0]
2700 +                    #Get the best known distance
2701 +                    _, current_dist = cache.get(alignset[alignid[0]][0], (None, None))
2702 +                    if current_dist is None or current_dist > dist:
2703 +                        #If it's better, update the cache
2704 +                        cache[alignset[alignid[0]][0]] = (targetset[bestid[0]][0], dist)
2705 +                        if dist <= equality_threshold:
2706 +                            #If perfect, stop trying to align this one
2707 +                            doneids.add(alignset[alignid[0]][0])
2708 +
2709 +                current_it += 1
2710 +                sys.stdout.write('\r%0.2f%%' % (current_it * 100. /
2711 +                                                nb_iterations))
2712 +                sys.stdout.flush()
2713 +                if doneids:
2714 +                    alignset = [a for a in alignset if a[0] not in doneids]
2715 +                if not alignset: #All items have been aligned
2716 +                    #TODO Increment current_it.
2717 +                    #The progress of the alignment process is computed from
2718 +                    #`current_it`. If all items of `alignset` are aligned, we
2719 +                    #stop the alignment process for this `alignset`; if
2720 +                    #`current_it` is not incremented, the progress shown will
2721 +                    #be wrong.
2722 +                    break
2723 +
2724 +    finally:
2725 +        rmtree(aligndir)
2726 +        rmtree(targetdir)
2727 +
2728 +    return cache
2729 +
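# The returned cache can be passed back for a refinement pass, e.g. with a
# tighter threshold (file names and the parsefile format dicts are
# illustrative):
#
#     cache = alignall_iterative('align.csv', 'target.csv', alignformat={},
#                                targetformat={}, threshold=0.5)
#     cache = alignall_iterative('align.csv', 'target.csv', alignformat={},
#                                targetformat={}, threshold=0.2, cache=cache)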
2730 +
2731 +
2732 +
2733 +
2734 +
2735 +
2736 +###############################################################################
2737 +### CLUSTERING-BASED BLOCKINGS FUNCTIONS ######################################
2738 +###############################################################################
2739 +# Backward compatibility. Now, use the BlockingObject inside the functions.
2740 +# These functions may be removed in a later release.
2741 +def findneighbours_clustering(alignset, targetset, indexes=(1, 1),
2742 +                              mode='kmeans', n_clusters=None):
2743 +    """ Find the neighbours using clustering (kmeans or minibatch kmeans)
2744 +    """
2745 +    warnings.warn(DeprecationWarning('This function will be removed in the next '
2746 +                                     'release.'
2747 +                                     ' You should rather use the KmeansBlocking '
2748 +                                     'object of the blocking module'))
2749 +    if mode == 'kmeans':
2750 +        blocking = KmeansBlocking(ref_attr_index=indexes[0],
2751 +                                  target_attr_index=indexes[1],
2752 +                                  n_clusters=n_clusters)
2753 +    elif mode == 'minibatch':
2754 +        blocking = MiniBatchKmeansBlocking(ref_attr_index=indexes[0],
2755 +                                           target_attr_index=indexes[1],
2756 +                                           n_clusters=n_clusters)
2757 +    else:
2758 +        raise ValueError("Mode should be 'kmeans' or 'minibatch'")
2759 +    # Fit blocking object
2760 +    blocking.fit(alignset, targetset)
2761 +    return list(blocking.iter_blocks())
2762 +
2763 +def findneighbours_kdtree(alignset, targetset, indexes=(1, 1), threshold=0.1):
2764 +    """ Find the neighbours using a kd-tree
2765 +    """
2766 +    warnings.warn(DeprecationWarning('This function will be removed in the next '
2767 +                                     'release.'
2768 +                                     ' You should rather use the KdTreeBlocking '
2769 +                                     'object of the blocking module'))
2770 +    blocking = KdTreeBlocking(ref_attr_index=indexes[0],
2771 +                              target_attr_index=indexes[1],
2772 +                              threshold=threshold)
2773 +    blocking.fit(alignset, targetset)
2774 +    return list(blocking.iter_blocks())
2775 +
2776 +def findneighbours_minhashing(alignset, targetset, indexes=(1, 1), threshold=0.1,
2777 +                              kwordsgram=1, siglen=200):
2778 +    """ Find the neighbours using minhashing
2779 +    """
2780 +    warnings.warn(DeprecationWarning('This function will be removed in the next '
2781 +                                     'release.'
2782 +                                     ' You should rather use the '
2783 +                                     'MinHashingBlocking '
2784 +                                     'object of the blocking module'))
2785 +    blocking = MinHashingBlocking(ref_attr_index=indexes[0],
2786 +                                  target_attr_index=indexes[1],
2787 +                                  threshold=threshold, kwordsgram=kwordsgram,
2788 +                                  siglen=siglen)
2789 +    blocking.fit(alignset, targetset)
2790 +    return list(blocking.iter_blocks())
2791 +
2792 +def findneighbours(alignset, targetset, indexes=(1, 1), mode='kdtree',
2793 +                   neighbours_threshold=0.1, n_clusters=None, kwordsgram=1, siglen=200):
2794 +    """ This function helps to find neighbours from items of alignset and
2795 +        targetset. "Neighbours" are items that are "not so far" from each
2796 +        other, i.e. with a similar label, located in the same area, etc.
2797 +
2798 +        This function handles two types of neighbouring: text and numeric.
2799 +        For text values, you have to use "minhashing"; for numeric values,
2800 +        you can choose from "kdtree", "kmeans" and "minibatch".
2801 +
2802 +        The arguments are:
2803 +            - `alignset` and `targetset` are the sets where neighbours have to
2804 +              be found.
2805 +            - `indexes` are the locations of the items to compare.
2806 +            - `mode` is the search type to use.
2807 +            - `neighbours_threshold` is the threshold used by the chosen
2808 +              `mode` to decide whether two items are neighbours.
2809 +            - `n_clusters` is used by the "kmeans" and "minibatch" methods;
2810 +              it is the number of clusters to use.
2811 +
2812 +            - `kwordsgram` and `siglen` are used for "minhashing". `kwordsgram`
2813 +              is the length of the wordsgrams to use, and `siglen` is the
2814 +              length of the minhashing signature matrix.
2815 +
2816 +        Return a list of lists, built as follows:
2817 +            [
2818 +                [[indexes_of_alignset_0], [indexes_of_targetset_0]],
2819 +                [[indexes_of_alignset_1], [indexes_of_targetset_1]],
2820 +                [[indexes_of_alignset_2], [indexes_of_targetset_2]],
2821 +                [[indexes_of_alignset_3], [indexes_of_targetset_3]],
2822 +                ...
2823 +            ]
2824 +    """
2825 +    warnings.warn(DeprecationWarning('This function will be removed in the next '
2826 +                                     'release.'
2827 +                                     ' You should rather use the '
2828 +                                     'BaseBlocking '
2829 +                                     'objects of the blocking module'))
2830 +    SEARCHERS = set(['kdtree', 'minhashing', 'kmeans', 'minibatch'])
2831 +    mode = mode.lower()
2832 +
2833 +    if mode not in SEARCHERS:
2834 +        raise NotImplementedError('Unknown mode given')
2835 +    if mode == 'kdtree':
2836 +        return findneighbours_kdtree(alignset, targetset, indexes, neighbours_threshold)
2837 +    elif mode == 'minhashing':
2838 +        return findneighbours_minhashing(alignset, targetset, indexes, neighbours_threshold,
2839 +                                         kwordsgram, siglen)
2840 +    elif mode in set(['kmeans', 'minibatch']):
2841 +        try:
2842 +            return findneighbours_clustering(alignset, targetset, indexes, mode, n_clusters)
2843 +        except ImportError:
2844 +            raise NotImplementedError('scikit-learn does not seem to be installed')
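# A sketch of dispatching through findneighbours(); the data layout matches
# the kd-tree example in blocking.py (identifier in column 0, numeric
# attribute in column 1):
#
#     alignset = [('A1', 1.0), ('A2', 5.0)]
#     targetset = [('T1', 1.05)]
#     neighbours = findneighbours(alignset, targetset, indexes=(1, 1),
#                                 mode='kdtree', neighbours_threshold=0.1)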
diff --git a/test/test_alignment.py b/test/test_alignment.py
@@ -20,12 +20,12 @@
2845  import random
2846  random.seed(6) ### Make sure tests are repeatable
2847  from os import path
2848 
2849  from nazca.normalize import simplify
2850 -import nazca.aligner as alig
2851 -import nazca.blocking as blo
2852 +import nazca.record_linkage.aligner as alig
2853 +import nazca.record_linkage.blocking as blo
2854  from nazca.distances import LevenshteinProcessing, GeographicalProcessing
2855 
2856 
2857  TESTDIR = path.dirname(__file__)
2858 
diff --git a/test/test_blocking.py b/test/test_blocking.py
@@ -21,15 +21,15 @@
2859  import random
2860  random.seed(6) ### Make sure tests are repeatable / Minhashing
2861 
2862  from nazca.distances import (levenshtein, soundex, soundexcode,   \
2863                               jaccard, euclidean, geographical)
2864 -from nazca.blocking import (KeyBlocking, SortedNeighborhoodBlocking,
2865 -                            MergeBlocking,
2866 -                            NGramBlocking, PipelineBlocking,
2867 -                            SoundexBlocking, KmeansBlocking,
2868 -                            MinHashingBlocking, KdTreeBlocking)
2869 +from nazca.record_linkage.blocking import (KeyBlocking, SortedNeighborhoodBlocking,
2870 +                                           MergeBlocking,
2871 +                                           NGramBlocking, PipelineBlocking,
2872 +                                           SoundexBlocking, KmeansBlocking,
2873 +                                           MinHashingBlocking, KdTreeBlocking)
2874  from nazca.normalize import SimplifyNormalizer, loadlemmas
2875 
2876 
2877  TESTDIR = path.dirname(__file__)
2878 
diff --git a/test/test_old_api.py b/test/test_old_api.py
@@ -20,17 +20,17 @@
2879  import random
2880  random.seed(6) ### Make sure tests are repeatable
2881  from os import path
2882 
2883  from nazca.normalize import loadlemmas, simplify
2884 -from nazca.old_api import (normalize_set,
2885 -                           findneighbours_clustering,
2886 -                           findneighbours_kdtree,
2887 -                           findneighbours_minhashing,
2888 -                           align, subalign,
2889 -                           conquer_and_divide_alignment,
2890 -                           alignall, alignall_iterative)
2891 +from nazca.record_linkage.old_api import (normalize_set,
2892 +                                          findneighbours_clustering,
2893 +                                          findneighbours_kdtree,
2894 +                                          findneighbours_minhashing,
2895 +                                          align, subalign,
2896 +                                          conquer_and_divide_alignment,
2897 +                                          alignall, alignall_iterative)
2898 
2899 
2900  TESTDIR = path.dirname(__file__)
2901 
2902