C++ Utilities  4.17.0
Useful C++ classes and routines such as argument parser, IO and conversion utilities
levenshtein.cpp
Go to the documentation of this file.
1 #include "./levenshtein.h"
2 #include "./multiarray.h"
3 
4 #include "../math/math.h"
5 
6 #include <iostream>
7 #include <limits>
8 #include <memory>
9 
10 using namespace std;
11 
17 namespace MiscUtilities {
18 
20 
// Two-index array of size_t cells built on NoneOwningMultiArray; the functions in this file
// attach its storage through buffer() before any cell is accessed via at(row, column).
22 using DistanceArray = MultiArray<size_t, NoneOwningMultiArray, size_t, size_t>;
23 
36 void initDistanceArray(DistanceArray &distanceArray, const size_t size1, const size_t size2)
37 {
38  const auto maxDistance(size1 + size2);
39  distanceArray.at(0, 0) = maxDistance;
40  for (size_t i = 0; i <= size1; ++i) {
41  distanceArray.at(i + 1, 1) = i;
42  distanceArray.at(i + 1, 0) = maxDistance;
43  }
44  for (size_t i = 0; i <= size2; ++i) {
45  distanceArray.at(1, i + 1) = i;
46  distanceArray.at(0, i + 1) = maxDistance;
47  }
48 }
49 
52 size_t performDamerauLevenshteinAlgorithm(
53  DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
54 {
55  size_t dist1[std::numeric_limits<unsigned char>::max() + 1] = { 0 };
56  for (size_t index1 = 1; index1 <= size1; ++index1) {
57  size_t dist2 = 0;
58  for (size_t index2 = 1; index2 <= size2; ++index2) {
59  const size_t substitution((str1[index1 - 1] == str2[index2 - 1]) ? 0 : 1);
60  const size_t transposition1(dist1[static_cast<unsigned char>(str2[index2 - 1])]);
61  const size_t transposition2(dist2);
62  if (!substitution) {
63  dist2 = index2;
64  }
65  // clang-format off
66  distanceArray.at(index1 + 1, index2 + 1) = MathUtilities::min(
67  distanceArray.at(index1, index2) + substitution, // substitution
68  distanceArray.at(index1 + 1, index2) + 1, // insertion
69  distanceArray.at(index1, index2 + 1) + 1, // deletion
70  distanceArray.at(transposition1, transposition2) + (index1 - transposition1 - 1) + 1 + (index2 - transposition2 - 1) // transposition
71  );
72  // clang-format on
73  }
74  dist1[static_cast<int>(str1[index1 - 1])] = index1;
75  }
76  return distanceArray.at(size1 + 1, size2 + 1);
77 }
78 
/*!
 * \brief Backs \a distanceArray with a std::vector of totalSize() elements, initializes it and
 *        runs the Damerau–Levenshtein computation.
 * \returns Returns the result of performDamerauLevenshteinAlgorithm().
 * \remarks The vector is destroyed on return, so the pointer stored in \a distanceArray via
 *          buffer() must not be used afterwards.
 */
template <typename DistanceArray>
size_t performDamerauLevenshteinAlgorithmAllocatingOnHeap(
    DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
    // value-initialized (zeroed) storage sized for the whole array
    std::vector<size_t> storage(distanceArray.totalSize());
    distanceArray.buffer() = storage.data();

    initDistanceArray(distanceArray, size1, size2);
    const size_t distance = performDamerauLevenshteinAlgorithm(distanceArray, str1, size1, str2, size2);
    return distance;
}
89 
/*!
 * \brief Backs \a distanceArray with a local, zero-initialized array of 128 elements, initializes
 *        it and runs the Damerau–Levenshtein computation.
 * \returns Returns the result of performDamerauLevenshteinAlgorithm().
 * \remarks The caller must ensure the array needs at most 128 cells. The local array ceases to
 *          exist on return, so the pointer stored via buffer() must not be used afterwards.
 */
template <typename DistanceArray>
size_t performDamerauLevenshteinAlgorithmAllocatingOnStack(
    DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
    size_t storage[128] = {};
    distanceArray.buffer() = storage;

    initDistanceArray(distanceArray, size1, size2);
    const size_t distance = performDamerauLevenshteinAlgorithm(distanceArray, str1, size1, str2, size2);
    return distance;
}
101 
103 
122 std::size_t computeDamerauLevenshteinDistance(const char *const str1, const size_t size1, const char *const str2, const size_t size2)
123 {
124  // allocate distance array
125  auto distanceArray(makeNoneOwningMultiArray<std::size_t>(size1 + 2, size2 + 2));
126  if (distanceArray.totalSize() <= 128) {
127  return performDamerauLevenshteinAlgorithmAllocatingOnStack(distanceArray, str1, size1, str2, size2);
128  } else {
129  return performDamerauLevenshteinAlgorithmAllocatingOnHeap(distanceArray, str1, size1, str2, size2);
130  }
131 }
132 
133 } // namespace MiscUtilities
constexpr T max(T first, T second)
Returns the greatest of the given items.
Definition: math.h:29
constexpr int i
constexpr T min(T first, T second)
Returns the smallest of the given items.
Definition: math.h:17
std::size_t computeDamerauLevenshteinDistance(const char *const str1, const size_t size1, const char *const str2, const size_t size2)
Computes Damerau–Levenshtein distance with adjacent transpositions.
Contains various utilities such as computing Damerau–Levenshtein distance and N-dimensional arrays.
Definition: multiarray.h:8