mq-lang 0.5.13

Core language implementation for mq query language
Documentation
# Fuzzy Match Implementation in mq

def _levenshtein_distance(s1, s2):
  let len1 = len(s1)
  | let len2 = len(s2)
  | let matrix = []
  | var i = 0
  | let matrix = while (i <= len1):
      var j = 0
      | let row = []
      | let row = while (j <= len2):
              let value = if (i == 0):
                j
              elif (j == 0):
                i
              else:
                0
              | let row = row + [value]
              | j += 1
              | row
            end
      | let matrix = matrix + [row]
      | i += 1
      | matrix
    end
  | i = 1
  | let matrix = while (i <= len1):
      var j = 1
      | let matrix = while (j <= len2):
              let cost = if (s1[i - 1] == s2[j - 1]): 0 else: 1
              | let deletion = matrix[i - 1][j] + 1
              | let insertion = matrix[i][j - 1] + 1
              | let substitution = matrix[i - 1][j - 1] + cost
              | let min_value = if (deletion <= insertion && deletion <= substitution):
                              deletion
                            elif (insertion <= substitution):
                              insertion
                            else:
                              substitution
              | let row = matrix[i]
              | let row = set(row, j, min_value)
              | let matrix = set(matrix, i, row)
              | j += 1
              | matrix
            end
      | i += 1
      | matrix
    end
  | if (len1 == 0): len2 elif (len2 == 0): len1 elif (is_none(matrix)): 0 else: matrix[len1][len2]
end

def _calculate_jaro_distance(s1, s2, len1, len2):
  let match_window = floor(if (len1 > len2): (len1 / 2) - 1 else: (len2 / 2) - 1)
  | let match_window = if (match_window < 0): 0 else: match_window
  | let s1_matches = []
  | let s2_matches = []
  | let matches = 0
  | var i = 0
  | let result = while (i < len1):
      let start = max(i - match_window, 0)
      | let end_ = min(len2, i + match_window + 1)
      | var j = start
      | var found = false
      | let result = while (j < end_ && !found):
              let result = if (s1[i] == s2[j] && !contains(s2_matches, j)):
                [true, j]
              else:
                [false, j + 1]
              | found = result[0]
              | j = result[1]
              | let result = if (found):
                              [s1_matches + [i], s2_matches + [j], matches + 1]
                            else:
                              [s1_matches, s2_matches, matches]
              | let s1_matches = result[0]
              | let s2_matches = result[1]
              | let matches = result[2]
              | result
            end
      | let s1_matches = result[0]
      | let s2_matches = result[1]
      | let matches = result[2]
      | i += 1
      | [s1_matches, s2_matches, matches]
    end
  | let s1_matches = result[0]
  | let s2_matches = result[1]
  | let matches = result[2]
  | if (is_none(matches) || matches == 0):
      0
    else:
      _calculate_jaro_with_transpositions(s1, s2, s1_matches, s2_matches, matches, len1, len2)
end

def _calculate_jaro_with_transpositions(s1, s2, s1_matches, s2_matches, matches, len1, len2):
  let transpositions = 0
  | var i = 0
  | let s1_matches_len = len(s1_matches)
  | let transpositions = while (i < s1_matches_len):
      let transpositions = if (s1_matches[i] != s2_matches[i]):
        transpositions + 1
      else:
        transpositions
      | i += 1
      | transpositions
    end
  | let transpositions = transpositions / 2
  | ((matches / len1) + (matches / len2) + ((matches - transpositions) / matches)) / 3
end

def _jaro_distance(s1, s2):
  let len1 = len(s1)
  | let len2 = len(s2)
  | if (len1 == 0 && len2 == 0):
      1
    elif (len1 == 0 || len2 == 0):
      0
    else:
      _calculate_jaro_distance(s1, s2, len1, len2)
end

def _common_prefix_length(s1, s2):
  let min_len = if (len(s1) < len(s2)): len(s1) else: len(s2)
  | var i = 0
  | while (i < min_len && s1[i] == s2[i]):
      i += 1
    end
  | i
end

def _fuzzy_match_with_algorithm(query, candidates, algorithm):
  map(candidates, fn(candidate):
    {"text": candidate, "score": algorithm(query, candidate)}
  end)
end

# Calculates the Levenshtein distance between two strings.
def levenshtein(s1, s2):
  _levenshtein_distance(s1, s2)
end

# Calculates the Jaro distance between two strings (0.0 to 1.0, where 1.0 is exact match).
def jaro(s1, s2):
  _jaro_distance(s1, s2)
end

# Calculates the Jaro-Winkler distance between two strings.
def jaro_winkler(s1, s2):
  let jaro_dist = _jaro_distance(s1, s2)
  | let prefix_length = _common_prefix_length(s1, s2)
  | let prefix_length = if (prefix_length > 4): 4 else: prefix_length
  | jaro_dist + (0.1 * to_number(prefix_length) * (1 - jaro_dist))
end

# Performs fuzzy matching on an array of strings using Jaro-Winkler distance.
def fuzzy_match(candidates, query):
  let candidates = if (is_array(candidates)):
    candidates
  else:
    [candidates]
  | _fuzzy_match_with_algorithm(query, candidates, jaro_winkler)
  | sort_by(fn(item): 1 - item["score"];)
end

# Performs fuzzy matching using Levenshtein distance.
def fuzzy_match_levenshtein(candidates, query):
  let candidates = if (is_array(candidates)):
    candidates
  else:
    [candidates]
  | _fuzzy_match_with_algorithm(query, candidates, levenshtein)
  | sort_by(fn(item): item["score"];)
end

# Performs fuzzy matching using Jaro distance.
def fuzzy_match_jaro(candidates, query):
  _fuzzy_match_with_algorithm(query, candidates, jaro)
  | sort_by(fn(item): 1 - item["score"];)
end

# Filters candidates by minimum fuzzy match score using Jaro-Winkler.
def fuzzy_filter(candidates, query, threshold):
  let matches = fuzzy_match(candidates, query)
  | filter(matches, fn(m): m["score"] >= threshold;)
end

# Finds the best fuzzy match from candidates.
def fuzzy_best_match(candidates, query):
  let matches = fuzzy_match(candidates, query)
  | if (is_empty(matches)):
      None
    else:
      first(matches)
end