How to implement and do OCR in a C# project?

Berker Yüceer picture Berker Yüceer · Jun 8, 2012 · Viewed 110.4k times · Source

I ve been searching for a while and all that i ve seen some OCR library requests. I would like to know how to implement the purest, easy to install and use OCR library with detailed info for installation into a C# project.

If posible, I just wanna implement it like a usual dll reference...

Example:

using org.pdfbox.pdmodel;
using org.pdfbox.util;

Also a little OCR code example would be nice, such as:

public string OCRFromBitmap(Bitmap Bmp)
{
    Bmp.Save(temppath, System.Drawing.Imaging.ImageFormat.Tiff);
    string OcrResult = Analyze(temppath);
    File.Delete(temppath);
    return OcrResult;
}

So please consider that I'm not familiar to OCR projects and give me an answer like talking to a dummy.

Edit: I guess people misunderstood my request. I wanted to know how to implement those open source OCR libraries to a C# project and how to use them. The link given as dup is not giving answers that I requested at all.

Answer

B.K. picture B.K. · Dec 9, 2014

If anyone is looking into this, I've been trying different options and the following approach yields very good results. The following are the steps to get a working example:

  1. Add .NET Wrapper for tesseract to your project. It can be added via NuGet package Install-Package Tesseract(https://github.com/charlesw/tesseract).
  2. Go to the Downloads section of the official Tesseract project (https://code.google.com/p/tesseract-ocr/ EDIT: It's now located here: https://github.com/tesseract-ocr/langdata).
  3. Download the preferred language data, example: tesseract-ocr-3.02.eng.tar.gz English language data for Tesseract 3.02.
  4. Create tessdata directory in your project and place the language data files in it.
  5. Go to Properties of the newly added files and set them to copy on build.
  6. Add a reference to System.Drawing.
  7. From .NET Wrapper repository, in the Samples directory copy the sample phototest.tif file into your project directory and set it to copy on build.
  8. Create the following two files in your project (just to get started):

Program.cs

using System;
using Tesseract;
using System.Diagnostics;

namespace ConsoleApplication
{
    class Program
    {
        public static void Main(string[] args)
        {
            var testImagePath = "./phototest.tif";
            if (args.Length > 0)
            {
                testImagePath = args[0];
            }

            try
            {
                var logger = new FormattedConsoleLogger();
                var resultPrinter = new ResultPrinter(logger);
                using (var engine = new TesseractEngine(@"./tessdata", "eng", EngineMode.Default))
                {
                    using (var img = Pix.LoadFromFile(testImagePath))
                    {
                        using (logger.Begin("Process image"))
                        {
                            var i = 1;
                            using (var page = engine.Process(img))
                            {
                                var text = page.GetText();
                                logger.Log("Text: {0}", text);
                                logger.Log("Mean confidence: {0}", page.GetMeanConfidence());

                                using (var iter = page.GetIterator())
                                {
                                    iter.Begin();
                                    do
                                    {
                                        if (i % 2 == 0)
                                        {
                                            using (logger.Begin("Line {0}", i))
                                            {
                                                do
                                                {
                                                    using (logger.Begin("Word Iteration"))
                                                    {
                                                        if (iter.IsAtBeginningOf(PageIteratorLevel.Block))
                                                        {
                                                            logger.Log("New block");
                                                        }
                                                        if (iter.IsAtBeginningOf(PageIteratorLevel.Para))
                                                        {
                                                            logger.Log("New paragraph");
                                                        }
                                                        if (iter.IsAtBeginningOf(PageIteratorLevel.TextLine))
                                                        {
                                                            logger.Log("New line");
                                                        }
                                                        logger.Log("word: " + iter.GetText(PageIteratorLevel.Word));
                                                    }
                                                } while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));
                                            }
                                        }
                                        i++;
                                    } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                                }
                            }
                        }
                    }
                }
            }
            catch (Exception e)
            {
                Trace.TraceError(e.ToString());
                Console.WriteLine("Unexpected Error: " + e.Message);
                Console.WriteLine("Details: ");
                Console.WriteLine(e.ToString());
            }
            Console.Write("Press any key to continue . . . ");
            Console.ReadKey(true);
        }



        private class ResultPrinter
        {
            readonly FormattedConsoleLogger logger;

            public ResultPrinter(FormattedConsoleLogger logger)
            {
                this.logger = logger;
            }

            public void Print(ResultIterator iter)
            {
                logger.Log("Is beginning of block: {0}", iter.IsAtBeginningOf(PageIteratorLevel.Block));
                logger.Log("Is beginning of para: {0}", iter.IsAtBeginningOf(PageIteratorLevel.Para));
                logger.Log("Is beginning of text line: {0}", iter.IsAtBeginningOf(PageIteratorLevel.TextLine));
                logger.Log("Is beginning of word: {0}", iter.IsAtBeginningOf(PageIteratorLevel.Word));
                logger.Log("Is beginning of symbol: {0}", iter.IsAtBeginningOf(PageIteratorLevel.Symbol));

                logger.Log("Block text: \"{0}\"", iter.GetText(PageIteratorLevel.Block));
                logger.Log("Para text: \"{0}\"", iter.GetText(PageIteratorLevel.Para));
                logger.Log("TextLine text: \"{0}\"", iter.GetText(PageIteratorLevel.TextLine));
                logger.Log("Word text: \"{0}\"", iter.GetText(PageIteratorLevel.Word));
                logger.Log("Symbol text: \"{0}\"", iter.GetText(PageIteratorLevel.Symbol));
            }
        }
    }
}

FormattedConsoleLogger.cs

using System;
using System.Collections.Generic;
using System.Text;
using Tesseract;

namespace ConsoleApplication
{
    public class FormattedConsoleLogger
    {
        const string Tab = "    ";
        private class Scope : DisposableBase
        {
            private int indentLevel;
            private string indent;
            private FormattedConsoleLogger container;

            public Scope(FormattedConsoleLogger container, int indentLevel)
            {
                this.container = container;
                this.indentLevel = indentLevel;
                StringBuilder indent = new StringBuilder();
                for (int i = 0; i < indentLevel; i++)
                {
                    indent.Append(Tab);
                }
                this.indent = indent.ToString();
            }

            public void Log(string format, object[] args)
            {
                var message = String.Format(format, args);
                StringBuilder indentedMessage = new StringBuilder(message.Length + indent.Length * 10);
                int i = 0;
                bool isNewLine = true;
                while (i < message.Length)
                {
                    if (message.Length > i && message[i] == '\r' && message[i + 1] == '\n')
                    {
                        indentedMessage.AppendLine();
                        isNewLine = true;
                        i += 2;
                    }
                    else if (message[i] == '\r' || message[i] == '\n')
                    {
                        indentedMessage.AppendLine();
                        isNewLine = true;
                        i++;
                    }
                    else
                    {
                        if (isNewLine)
                        {
                            indentedMessage.Append(indent);
                            isNewLine = false;
                        }
                        indentedMessage.Append(message[i]);
                        i++;
                    }
                }

                Console.WriteLine(indentedMessage.ToString());

            }

            public Scope Begin()
            {
                return new Scope(container, indentLevel + 1);
            }

            protected override void Dispose(bool disposing)
            {
                if (disposing)
                {
                    var scope = container.scopes.Pop();
                    if (scope != this)
                    {
                        throw new InvalidOperationException("Format scope removed out of order.");
                    }
                }
            }
        }

        private Stack<Scope> scopes = new Stack<Scope>();

        public IDisposable Begin(string title = "", params object[] args)
        {
            Log(title, args);
            Scope scope;
            if (scopes.Count == 0)
            {
                scope = new Scope(this, 1);
            }
            else
            {
                scope = ActiveScope.Begin();
            }
            scopes.Push(scope);
            return scope;
        }

        public void Log(string format, params object[] args)
        {
            if (scopes.Count > 0)
            {
                ActiveScope.Log(format, args);
            }
            else
            {
                Console.WriteLine(String.Format(format, args));
            }
        }

        private Scope ActiveScope
        {
            get
            {
                var top = scopes.Peek();
                if (top == null) throw new InvalidOperationException("No current scope");
                return top;
            }
        }
    }
}