I have a database table that has a column of SQL Server Soundex-encoded last name + first name. In my C# program I would like to convert a string using Soundex for use in my query.
Is there either a standard string function for Soundex in the .NET library, or is there an open-source library that implements it (perhaps as an extension method on string)?
I know this is late, but I also needed something similar (though no database involved), and the only answer isn't accurate (fails for 'Tymczak' and 'Pfister').
This is what I came up with:
// Console entry point that exercises Soundex.Generate against known name/code pairs.
// Each assertion passes the expected code first and the computed code second, which is
// the (expected, actual) order the Assert.AreEqual convention uses for failure messages.
class Program
{
    public static void Main(string[] args)
    {
        // A lone letter is padded with zeros to four characters.
        Assert.AreEqual("H000", Soundex.Generate("H"));

        // Similar-sounding names share a code.
        Assert.AreEqual("R163", Soundex.Generate("Robert"));
        Assert.AreEqual("R163", Soundex.Generate("Rupert"));
        Assert.AreEqual("R150", Soundex.Generate("Rubin"));

        // Same-class consonants separated by H are coded once.
        Assert.AreEqual("A261", Soundex.Generate("Ashcraft"));
        Assert.AreEqual("A261", Soundex.Generate("Ashcroft"));

        // Same-class consonants separated by a vowel are coded twice.
        Assert.AreEqual("T522", Soundex.Generate("Tymczak"));

        // A second letter in the same class as the first letter is not coded again.
        Assert.AreEqual("P236", Soundex.Generate("Pfister"));

        Assert.AreEqual("G362", Soundex.Generate("Gutierrez"));
        Assert.AreEqual("J250", Soundex.Generate("Jackson"));
        Assert.AreEqual("V532", Soundex.Generate("VanDeusen"));
        Assert.AreEqual("D250", Soundex.Generate("Deusen"));

        // W contributes no digit.
        Assert.AreEqual("S630", Soundex.Generate("Sword"));
        Assert.AreEqual("S630", Soundex.Generate("Sord"));

        // Non-letters are stripped before encoding.
        Assert.AreEqual("L230", Soundex.Generate("Log-out"));
        Assert.AreEqual("L230", Soundex.Generate("Logout"));

        // Inputs with no letters at all produce the empty code.
        Assert.AreEqual(Soundex.Empty, Soundex.Generate("123"));
        Assert.AreEqual(Soundex.Empty, Soundex.Generate(""));
        Assert.AreEqual(Soundex.Empty, Soundex.Generate(null));
    }
}
/// <summary>
/// Generates four-character Soundex codes (one letter followed by three digits).
/// </summary>
public static class Soundex
{
    /// <summary>Code returned when the input contains no A-Z letters.</summary>
    public const string Empty = "0000";

    // Matches every character that is not an upper-case ASCII letter.
    private static readonly Regex Sanitiser = new Regex(@"[^A-Z]", RegexOptions.Compiled);

    // H and W carry no code and do not separate same-class consonants.
    private static readonly Regex SilentSeparators = new Regex(@"[WH]", RegexOptions.Compiled);

    // A run of two or more identical code digits.
    private static readonly Regex CollapseRepeatedNumbers = new Regex(@"([1-6])\1+", RegexOptions.Compiled);

    // Vowels (and Y) carry no code but DO separate same-class consonants.
    private static readonly Regex RemoveVowelSounds = new Regex(@"[AEIOUY]", RegexOptions.Compiled);

    /// <summary>
    /// Computes the Soundex code of <paramref name="Phrase"/>.
    /// </summary>
    /// <param name="Phrase">
    /// Text to encode. May be <c>null</c>. Case is ignored and every character other than
    /// A-Z is discarded before encoding.
    /// </param>
    /// <returns>
    /// The first remaining letter followed by three digits, zero-padded on the right,
    /// or <see cref="Empty"/> when no letters remain.
    /// </returns>
    public static string Generate(string Phrase)
    {
        // Upper-case with the invariant culture: culture-sensitive casing (for example
        // Turkish, where 'i' upper-cases to dotted 'İ') would turn an ASCII letter into a
        // character the sanitiser then throws away.
        Phrase = Sanitiser.Replace((Phrase ?? string.Empty).ToUpperInvariant(), string.Empty);

        // Nothing to encode.
        if (Phrase.Length == 0)
        {
            return Empty;
        }

        char firstLetter = Phrase[0];

        // Consonants become digits; vowels, H and W stay as letters for now.
        string codes = Numify(Phrase);

        // Drop H/W first so that same-class consonants separated by any number of H/W
        // (e.g. S-H-S-H-S) become adjacent and collapse into a single digit below.
        codes = SilentSeparators.Replace(codes, string.Empty);
        codes = CollapseRepeatedNumbers.Replace(codes, "$1");

        // The first letter is emitted verbatim, so its own code (or the letter itself when
        // it is a vowel) must not be emitted again as the first digit.
        if (codes.Length > 0 && codes[0] == Numify(firstLetter))
        {
            codes = codes.Substring(1);
        }

        // Vowels have done their job as separators; only digits remain after this.
        codes = RemoveVowelSounds.Replace(codes, string.Empty);

        // Letter + digits, padded/truncated to exactly four characters.
        return (firstLetter + codes).PadRight(4, '0').Substring(0, 4);
    }

    // Maps every character of Phrase through Numify(char).
    private static string Numify(string Phrase)
    {
        var mapped = new char[Phrase.Length];
        for (int i = 0; i < mapped.Length; i++)
        {
            mapped[i] = Numify(Phrase[i]);
        }

        return new string(mapped);
    }

    // Returns the Soundex digit for a consonant, or the character unchanged when it has
    // no digit (vowels, H, W, Y).
    private static char Numify(char Character)
    {
        switch (Character)
        {
            case 'B': case 'F': case 'P': case 'V':
                return '1';
            case 'C': case 'G': case 'J': case 'K': case 'Q': case 'S': case 'X': case 'Z':
                return '2';
            case 'D': case 'T':
                return '3';
            case 'L':
                return '4';
            case 'M': case 'N':
                return '5';
            case 'R':
                return '6';
            default:
                return Character;
        }
    }
}