[distances] Add an ExactMatchProcessing, closes #252734

author: Vincent Michel <vincent.michel@logilab.fr>
changeset: 672dbc5cdeee
branch: default
phase: draft
hidden: yes
parent revision: #69174102ac53 [blocking] Add compatibility for older version of sklearn, closes #252733
child revision: #30b35c628028 [aligner] Keep the distance matrix by default, closes #252735
files modified by this revision
test/test_distances.py
utils/distances.py
# HG changeset patch
# User Vincent Michel <vincent.michel@logilab.fr>
# Date 1401196402 -7200
# Tue May 27 15:13:22 2014 +0200
# Node ID 672dbc5cdeeef6fb69c8b106e1c298a0facad7e0
# Parent 69174102ac5313f81b656dbf70865390cf78fd8c
[distances] Add an ExactMatchProcessing, closes #252734

diff --git a/test/test_distances.py b/test/test_distances.py
@@ -26,10 +26,11 @@
1  from dateutil import parser as dateparser
2 
3  from nazca.utils.distances import (levenshtein, soundex, soundexcode,
4                                     difflib_match,
5                                     jaccard, euclidean, geographical,
6 +                                   ExactMatchProcessing,
7                                     LevenshteinProcessing, SoundexProcessing,
8                                     JaccardProcessing, DifflibProcessing, TemporalProcessing)
9 
10 
11  class DistancesTest(unittest.TestCase):
@@ -134,10 +135,19 @@
12          dist_parislondon = geographical(paris, london, in_radians=False)
13 
14          self.assertAlmostEqual(dist_parislondon, 341564, 0)
15 
16 
17 +class ExactMatchTestCase(unittest.TestCase):
18 +
19 +    def test_pdist(self):
20 +        processing = ExactMatchProcessing()
21 +        _input = ['Victor Hugo', 'Victo Hugo', 'Victor Hugo']
22 +        pdist = processing.pdist(_input)
23 +        self.assertEqual([1, 0., 1], pdist)
24 +
25 +
26  class LevenshteinTestCase(unittest.TestCase):
27 
28      def setUp(self):
29          self.input1 = [u'Victor Hugo', u'Albert Camus', 'Jean Valjean']
30          self.input2 = [u'Victor Wugo', u'Albert Camus', 'Albert Camu']
diff --git a/utils/distances.py b/utils/distances.py
@@ -115,10 +115,15 @@
31 
32 
33  ###############################################################################
34  ### STRING DISTANCES ##########################################################
35  ###############################################################################
36 +def exact_match(a, b):
37 +    """ The simplest distance, defined as 0 if both values are equal, 1 otherwise.
38 +    """
39 +    return 0 if a==b else 1
40 +
41  def levenshtein(stra, strb, tokenizer=None):
42      """ Compute the Levenshtein distance between stra and strb.
43 
44      The Levenshtein distance is defined as the minimal cost to transform stra
45      into strb, where 3 operators are allowed :
@@ -438,10 +443,21 @@
46 
47 
48  ###############################################################################
49  ### CONCRETE PROCESSINGS #######################################################
50  ###############################################################################
51 +class ExactMatchProcessing(BaseProcessing):
52 +    """ A processing based on the exact match distance (0 if a==b, 1 otherwise)
53 +    """
54 +
55 +    def __init__(self, ref_attr_index=None, target_attr_index=None,
56 +                 tokenizer=None, weight=1, matrix_normalized=False):
57 +        super(ExactMatchProcessing, self).__init__(ref_attr_index,
58 +                                                   target_attr_index,
59 +                                                   exact_match,
60 +                                                   weight, matrix_normalized)
61 +
62  class LevenshteinProcessing(BaseProcessing):
63      """ A processing based on the levenshtein distance.
64      """
65 
66      def __init__(self, ref_attr_index=None, target_attr_index=None,