1 Reply Latest reply: Jun 12, 2015 11:21 AM by Mathieu LAURIER RSS

    Fuzzy Matching/Joining

    Marco Wedel

      Hi,

       

      there's been a thread yesterday about joining  two tables with similar text fields that do not exactly match.

       

      This thread was deleted, so I post an example here, maybe someone still cares.

       

      I tried to calculate the levenshtein distance of those two text fields to be able to select corresponding records:

       

      QlikCommunity_Thread_130371_Pic1.JPG.jpg

       

      QlikCommunity_Thread_130371_Pic2.JPG.jpg

       

      QUALIFY *;
      
      Table1:
      LOAD *, RecNo() as RecNo INLINE [
          MARK1, MARK2
          CH: WV2ZZZ7HZ9H119390, ENGINE NO: AXB179634
          CH: WV2ZZZ7HZ9H119548, ENGINE NO: AXB179429
          CHASSIS NO:SHSRE77508U101822, ENG NO:K2424-1019151
          CH: KL1PM5E51AK528096, ENG: F18D4009613KA
          CH: WV2ZZZ7HZ9H100620, ENG NO: AXB178969
          CH: AHTLT58E406009706, ENG: 1NR0178419
          CH: AHTLT58E206008912, ENG: 1NR0162953
          CH: AHTLT58E606008833, ENG: 1NR0162819
      ];
      
      Table2:
      LOAD *, RecNo() as RecNo INLINE [
          MARK1, MARK2
          CHASS# WV2ZZZ7HZ9H119390, E# AXB179634
          C# WV2ZZZ7HZ9H119548, ENG NO: AXB179429
          CH: AHTLT58E206008912, ENG: 1NR0162953
          CH: KL1PM5E51AK528096, ENG: F18D4009613KA
          CHS# WV2ZZZ7HZ9H100620, ENG#: AXB178969
          CH: AHTLT58E406009706, ENG: 1NR0178419
          VIN NO:SHSRE77508U101822, ENG:K2424-1019151
          CH: AHTLT58E606008833, ENG: 1NR0162819
      ];
      
      UNQUALIFY *;
      
      NoConcatenate
      
      tabLinks:
      LOAD * Resident Table1;
      Join (tabLinks)
      LOAD * Resident Table2;
      Left Join (tabLinks)
      LOAD Table1.RecNo,
           Table2.RecNo,
           levenshtein(Table1.MARK1,Table2.MARK1) as LevDistMARK1,
           levenshtein(Table1.MARK2,Table2.MARK2) as LevDistMARK2
      Resident tabLinks;
      
      DROP Fields Table1.MARK1, Table2.MARK1, Table1.MARK2, Table2.MARK2 From tabLinks;
      

       

      using this module / vbs-function:

       

      '********************************************************************************************************
      '*
      '*  VBScript function implementing Levenshtein-Distance-algorithm
      '*  source: http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#VBScript
      '*
      '********************************************************************************************************
      
      
      Function levenshtein( a, b )
        Dim i,j,cost,d,min1,min2,min3
      
       ' Avoid calculations where there there are empty words
        If Len( a ) = 0 Then levenshtein = Len( b ): Exit Function
        If Len( b ) = 0 Then levenshtein = Len( a ): Exit Function
      
       ' Array initialization
        ReDim d( Len( a ), Len( b ) )
      
        For i = 0 To Len( a ): d( i, 0 ) = i: Next
        For j = 0 To Len( b ): d( 0, j ) = j: Next
      
       ' Actual calculation
        For i = 1 To Len( a )
        For j = 1 To Len( b )
                              If Mid(a, i, 1) = Mid(b, j, 1) Then cost = 0 Else cost = 1 End If
      
        ' Since min() function is not a part of VBScript, we'll "emulate" it below
        min1 = ( d( i - 1, j ) + 1 )
        min2 = ( d( i, j - 1 ) + 1 )
        min3 = ( d( i - 1, j - 1 ) + cost )
      
        If min1 <= min2 And min1 <= min3 Then
        d( i, j ) = min1
        ElseIf min2 <= min1 And min2 <= min3 Then
        d( i, j ) = min2
        Else
        d( i, j ) = min3
        End If
        Next
        Next
      
        levenshtein = d( Len( a ), Len( b ) )
      End Function
      

       

      hope this helps

       

      regards

       

      Marco