3
0
Files
corteza/server/pkg/str/soundex.go
Vivek Patel e7fa07cfae Extends DeDup capabilities
Support modifier for value search and allow ability to select matching criteria for multi value field

- Removed name from rule for now
- Value modifiers available for search are ignore-case, case-sensitive, fuzzy-search, and sounds-like
- Multi value matching criteria are one-of, equal
- Migrate the RecordDeDup config for modules by adding an upgrade fix that converts module.config.recordDeDup to the latest DeDupRule struct.
2023-01-25 21:54:36 +05:30

105 lines
3.1 KiB
Go

package str
import (
	"strings"
	"unicode"
)
// ToSoundex returns the American Soundex code for s: the first letter of s
// (upper-cased) followed by exactly three digits, e.g. "Robert" -> "R163".
// https://en.wikipedia.org/wiki/Soundex
//
// The encoding is case-insensitive and follows these rules:
//
//  1. Retain the first letter of the name, whatever it is.
//  2. Replace the consonants after the first letter with digits:
//     b, f, p, v             → 1
//     c, g, j, k, q, s, x, z → 2
//     d, t                   → 3
//     l                      → 4
//     m, n                   → 5
//     r                      → 6
//  3. Letters with the same digit that are adjacent, or separated only by
//     'h' or 'w', are coded once. The same letters separated by a vowel
//     (a, e, i, o, u, y) are coded twice. This also applies to the first
//     letter: its digit is never emitted, but it suppresses an identical
//     digit that immediately follows it.
//  4. Stop after three digits; pad with zeros when fewer were produced.
//
// Runes that are not letters (digits, punctuation, whitespace) are ignored.
// Letters that have no Soundex digit, including non-ASCII letters, act like
// vowels. If s contains no letter at all, "0000" is returned so the result
// always has four characters.
func ToSoundex(s string) string {
	// codeLen is the length of a Soundex code: one letter plus three digits.
	const codeLen = 4

	var (
		out strings.Builder
		// prev is the digit of the most recent coded consonant, or 0 when
		// there is none or a vowel has been seen since.
		prev byte
		// n counts the characters (not bytes) written to out so far.
		n int
	)

	for _, r := range s {
		if !unicode.IsLetter(r) {
			continue
		}

		lower := unicode.ToLower(r)

		var digit byte
		switch lower {
		case 'b', 'f', 'p', 'v':
			digit = '1'
		case 'c', 'g', 'j', 'k', 'q', 's', 'x', 'z':
			digit = '2'
		case 'd', 't':
			digit = '3'
		case 'l':
			digit = '4'
		case 'm', 'n':
			digit = '5'
		case 'r':
			digit = '6'
		}

		if n == 0 {
			// The first letter is kept verbatim; remembering its digit lets
			// it suppress an identical digit right after it ("Pfister").
			out.WriteRune(unicode.ToUpper(r))
			prev = digit
			n++
			continue
		}

		switch {
		case digit != 0:
			if digit != prev {
				out.WriteByte(digit)
				n++
			}
			prev = digit
		case lower == 'h' || lower == 'w':
			// 'h' and 'w' do not separate equal digits, so prev is kept.
		default:
			// A vowel (or any other uncoded letter) allows the same digit
			// to be emitted again.
			prev = 0
		}

		if n == codeLen {
			break
		}
	}

	if n == 0 {
		return strings.Repeat("0", codeLen)
	}

	for ; n < codeLen; n++ {
		out.WriteByte('0')
	}

	return out.String()
}