DotNet Soundex Function

automatic picture automatic · Jun 20, 2012 · Viewed 8.6k times · Source

I have a database table that has a column of SQLServer Soundex encoded last name + first name. In my C# program I would like to convert a string using soundex for use in my query.

Is there either a standard string function for Soundex in the .NET library, or is there an open source library that implements it (perhaps as an extension method on string)?

Answer

Daniel Flint picture Daniel Flint · Sep 24, 2015

I know this is late, but I also needed something similar (though no database involved), and the only answer isn't accurate (fails for 'Tymczak' and 'Pfister').

This is what I came up with:

/// <summary>
/// Smoke test for <see cref="Soundex.Generate"/>: checks a set of names against
/// their expected four-character Soundex codes.
/// </summary>
class Program
{
    /// <summary>
    /// Runs every check in sequence.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    public static void Main(string[] args)
    {
        // NOTE(review): Assert is not declared in this file; this assumes the
        // MSTest/NUnit-style Assert.AreEqual(expected, actual) parameter order, so the
        // expected code is passed first to keep failure messages labelled correctly.

        // A single letter is padded with zeros.
        Assert.AreEqual("H000", Soundex.Generate("H"));

        // Names that should share a code.
        Assert.AreEqual("R163", Soundex.Generate("Robert"));
        Assert.AreEqual("R163", Soundex.Generate("Rupert"));
        Assert.AreEqual("R150", Soundex.Generate("Rubin"));

        // Same-coded letters separated by H or W collapse into one digit.
        Assert.AreEqual("A261", Soundex.Generate("Ashcraft"));
        Assert.AreEqual("A261", Soundex.Generate("Ashcroft"));

        // Same-coded letters separated by a vowel are coded twice.
        Assert.AreEqual("T522", Soundex.Generate("Tymczak"));

        // A second letter in the same class as the first letter is not coded.
        Assert.AreEqual("P236", Soundex.Generate("Pfister"));

        Assert.AreEqual("G362", Soundex.Generate("Gutierrez"));
        Assert.AreEqual("J250", Soundex.Generate("Jackson"));
        Assert.AreEqual("V532", Soundex.Generate("VanDeusen"));
        Assert.AreEqual("D250", Soundex.Generate("Deusen"));
        Assert.AreEqual("S630", Soundex.Generate("Sword"));
        Assert.AreEqual("S630", Soundex.Generate("Sord"));

        // Non-letters are ignored.
        Assert.AreEqual("L230", Soundex.Generate("Log-out"));
        Assert.AreEqual("L230", Soundex.Generate("Logout"));

        // Inputs without any letters produce the empty code.
        Assert.AreEqual(Soundex.Empty, Soundex.Generate("123"));
        Assert.AreEqual(Soundex.Empty, Soundex.Generate(""));
        Assert.AreEqual(Soundex.Empty, Soundex.Generate(null));
    }
}

/// <summary>
/// Generates four-character Soundex codes: the first letter of the input followed by
/// three digits that encode the consonant sounds that follow it.
/// </summary>
public static class Soundex
{
    /// <summary>Code returned when the input contains no letters A-Z.</summary>
    public const string Empty = "0000";

    // Total length of a generated code (one letter plus three digits).
    private const int CodeLength = 4;

    // Matches every character that is not an upper-case ASCII letter.
    private static readonly Regex Sanitiser = new Regex(@"[^A-Z]", RegexOptions.Compiled);

    // Matches either a digit followed by any number of repeats of that same digit (each
    // repeat optionally preceded by H/W), or a run of H/W on its own. Replacing a match
    // with "$1" therefore keeps one digit per run and deletes stand-alone H/W.
    private static readonly Regex CollapseRepeatedNumbers = new Regex(@"(\d)(?:[WH]*\1)*|[WH]+", RegexOptions.Compiled);

    // Matches the letters that separate consonant runs but are never coded themselves.
    private static readonly Regex RemoveVowelSounds = new Regex(@"[AEIOUY]", RegexOptions.Compiled);

    /// <summary>
    /// Computes the Soundex code for <paramref name="Phrase"/>.
    /// </summary>
    /// <param name="Phrase">
    /// Text to encode. May be <c>null</c>. Case is ignored and every character other
    /// than A-Z (after upper-casing) is discarded before encoding.
    /// </param>
    /// <returns>
    /// A four-character code, or <see cref="Empty"/> when no letters remain after
    /// sanitising.
    /// </returns>
    public static string Generate(string Phrase)
    {
        // Upper-case with the invariant culture so the result does not depend on the
        // current culture (Turkish casing, for example, maps 'i' to a character outside
        // A-Z, which the sanitiser would then strip).
        var letters = Sanitiser.Replace((Phrase ?? string.Empty).ToUpperInvariant(), string.Empty);

        // Nothing to encode.
        if (letters.Length == 0)
        {
            return Empty;
        }

        var firstLetter = letters[0];

        // Convert consonants to digits, then merge repeated digits (letters of the same
        // sound class), including repeats separated by any number of H or W.
        var encoded = CollapseRepeatedNumbers.Replace(Numify(letters), "$1");

        // The first letter is emitted verbatim, so drop its own entry from the encoded
        // form. A leading H or W has already been removed by the collapse step, in which
        // case the comparison fails and nothing more is dropped.
        if (encoded.Length > 0 && encoded[0] == Numify(firstLetter))
        {
            encoded = encoded.Substring(1);
        }

        // Vowels have done their job of separating digit runs; remove them.
        encoded = RemoveVowelSounds.Replace(encoded, string.Empty);

        // Concatenate, pad and trim to guarantee the X### format.
        return (firstLetter + encoded).PadRight(CodeLength, '0').Substring(0, CodeLength);
    }

    // Maps every character of Phrase through Numify(char).
    private static string Numify(string Phrase)
    {
        var mapped = new char[Phrase.Length];
        for (var i = 0; i < mapped.Length; i++)
        {
            mapped[i] = Numify(Phrase[i]);
        }

        return new string(mapped);
    }

    // Returns the Soundex digit for a coded consonant; any other character is returned
    // unchanged.
    private static char Numify(char Character)
    {
        switch (Character)
        {
            case 'B': case 'F': case 'P': case 'V':
                return '1';
            case 'C': case 'G': case 'J': case 'K': case 'Q': case 'S': case 'X': case 'Z':
                return '2';
            case 'D': case 'T':
                return '3';
            case 'L':
                return '4';
            case 'M': case 'N':
                return '5';
            case 'R':
                return '6';
            default:
                return Character;
        }
    }
}