Reading a CR2 (Raw Canon Image) header using Python

Question 1

Reading a CR2 (Raw Canon Image) header using Python

python image-processing metadata binary-data

Escualo · Sep 12, 2010 · Viewed 8k times · Source

Answer

Answer

Have you taken into account the header which should (according to the spec) precede the IFD block you're talking about?

I looked through the spec, and it says the first IFD block follows the 16-byte header. So if we read bytes 16 and 17 (at offset 0x10) we should get the number of entries in the first IFD block. Then we just have to step through each 12-byte entry until we find the one with the matching tag ID; its value field (as I read it) gives us the byte offset of your date/time string.

This works for me:

from struct import *

def FindDateTimeOffsetFromCR2(buffer, ifd_offset, endian_flag='<'):
    """Return the value stored for the DateTime tag (0x0132) in an IFD.

    Args:
        buffer: Bytes read from the start of the CR2 file.
        ifd_offset: Byte offset within ``buffer`` at which the IFD starts
            (a 2-byte entry count followed by 12-byte entries).
        endian_flag: ``struct`` byte-order prefix used to decode the IFD.
            Defaults to little-endian. Pass '<', '>' or '='; '@' (or an
            empty string) selects native alignment, which pads the 12-byte
            entry layout on some platforms.

    Returns:
        The value field of the DateTime entry (used by the caller as the
        byte offset of the date/time string), or -1 if no entry with tag
        0x0132 is present. If several entries match, the last one wins.

    Raises:
        struct.error: If ``buffer`` is too short to hold the whole IFD.
    """
    # Read the number of entries in IFD #0.  An explicit byte-order prefix
    # also switches struct to standard sizes with no padding.
    (num_of_entries,) = unpack_from(endian_flag + 'H', buffer, ifd_offset)
    print("ifd #0 contains %d entries" % num_of_entries)

    # Work out where the date time is stored
    datetime_offset = -1
    # Visit every entry; the previous bound (num_of_entries - 1) skipped
    # the final one.
    for entry_num in range(num_of_entries):
        # Entries are 12 bytes each and start right after the 2-byte count.
        entry_offset = ifd_offset + 2 + entry_num * 12
        (tag_id, tag_type, num_of_value, value) = unpack_from(
            endian_flag + 'HHLL', buffer, entry_offset)
        if tag_id == 0x0132:
            print("found datetime at offset %d" % value)
            datetime_offset = value
    return datetime_offset

if __name__ == '__main__':
    # Script entry point: print the date/time characters stored in the CR2.
    with open("IMG_6113.CR2", "rb") as f:
        buffer = f.read(1024) # read the first 1kb of the file should be enough to find the date / time

    datetime_offset = FindDateTimeOffsetFromCR2(buffer, 0x10)

    if datetime_offset < 0:
        # -1 means the IFD has no 0x0132 entry; unpacking at that offset
        # would fail or read unrelated bytes.
        print("no datetime tag (0x0132) found in ifd #0")
    elif datetime_offset + 20 > len(buffer):
        # The 20-character string lies beyond the part of the file we read.
        print("datetime string at offset %d is outside the %d bytes read"
              % (datetime_offset, len(buffer)))
    else:
        # 20 one-character fields: the date/time text plus its terminator.
        print(unpack_from(20*'s', buffer, datetime_offset))

Output for my example file is:

ifd #0 contains 14 entries
found datetime at offset 250
('2', '0', '1', '0', ':', '0', '8', ':', '0', '1', ' ', '2', '3', ':', '4', '5', ':', '4', '6', '\x00')

[edit] - a revised / more thorough example

from struct import *

# Human-readable names for the IFD tag ids this script knows how to label.
# Keys are the numeric tag ids as stored in an IFD entry; values are the
# labels printed next to each matching entry.
recognised_tags = {
    # image layout
    0x0100: "imageWidth",
    0x0101: "imageLength",
    0x0102: "bitsPerSample",
    0x0103: "compression",
    # camera identification
    0x010f: "make",
    0x0110: "model",
    # image data location and orientation
    0x0111: "stripOffset",
    0x0112: "orientation",
    0x0117: "stripByteCounts",
    # resolution
    0x011a: "xResolution",
    0x011b: "yResolution",
    0x0128: "resolutionUnit",
    # timestamp
    0x0132: "dateTime",
    # entries labelled as EXIF and GPS blocks
    0x8769: "EXIF",
    0x8825: "GPS data",
}

def GetHeaderFromCR2(buffer):
    """Unpack and print the 16-byte header at the start of a CR2 buffer.

    The byte order used to decode the multi-byte fields is taken from the
    first two bytes of the buffer ("II" -> little-endian, "MM" ->
    big-endian).  If neither marker is present the fields are decoded in
    the machine's native byte order.

    Args:
        buffer: Bytes read from the start of the file (at least 16 bytes).

    Returns:
        A 7-tuple: (byte_order, tiff_magic_word, tiff_offset,
        cr2_magic_word, cr2_major_version, cr2_minor_version,
        raw_ifd_offset).

    Raises:
        struct.error: If ``buffer`` holds fewer than 16 bytes.
    """
    # Pick the struct byte-order prefix from the file's own marker.  Every
    # prefix used here selects standard sizes with no padding, so the
    # format is exactly 2+2+4+2+1+1+4 = 16 bytes on any platform (the
    # un-prefixed native mode pads it to 32 bytes where 'L' is 8 bytes).
    marker = buffer[0:2]
    if marker == b'II':
        endian_flag = '<'
    elif marker == b'MM':
        endian_flag = '>'
    else:
        endian_flag = '='

    # Unpack the header into a tuple
    header = unpack_from(endian_flag + 'HHLHBBL', buffer)

    print("\nbyte_order = 0x%04X" % header[0])
    print("tiff_magic_word = %d" % header[1])
    print("tiff_offset = 0x%08X" % header[2])
    print("cr2_magic_word = %d" % header[3])
    print("cr2_major_version = %d" % header[4])
    print("cr2_minor_version = %d" % header[5])
    print("raw_ifd_offset = 0x%08X\n" % header[6])

    return header

def FindDateTimeOffsetFromCR2(buffer, ifd_offset, endian_flag):
    """Print every entry of an IFD and return the DateTime entry's value.

    Args:
        buffer: Bytes read from the start of the CR2 file.
        ifd_offset: Byte offset within ``buffer`` at which the IFD starts
            (a 2-byte entry count followed by 12-byte entries).
        endian_flag: ``struct`` byte-order prefix ('<', '>', '=' or '@').
            '@' is treated as '=' (native byte order, standard sizes)
            because native alignment would pad the 12-byte entry layout
            on some platforms.

    Returns:
        The value field of the entry with tag 0x0132 (used by the caller
        as the byte offset of the date/time string), or -1 if the IFD has
        no such entry.

    Raises:
        AssertionError: If the DateTime entry is not of type 2 with a
            count of 20 (checks are skipped when Python runs with -O).
        struct.error: If ``buffer`` is too short to hold the whole IFD.
    """
    if endian_flag == '@':
        endian_flag = '='

    # Read the number of entries in IFD #0
    (num_of_entries,) = unpack_from(endian_flag + 'H', buffer, ifd_offset)
    print("Image File Directory #0 contains %d entries\n" % num_of_entries)

    # Work out where the date time is stored
    datetime_offset = -1

    # Go through all the entries looking for the datetime field
    print(" id  | type |  number  |  value   ")
    for entry_num in range(num_of_entries):

        # Grab this IFD entry (12 bytes each, after the 2-byte count)
        entry_offset = ifd_offset + 2 + entry_num * 12
        (tag_id, tag_type, num_of_value, value) = unpack_from(
            endian_flag + 'HHLL', buffer, entry_offset)

        # Print out the entry for information.  The row is assembled first
        # and printed once so every entry ends with a newline, including
        # entries whose tag has no known name.
        row = "%04X | %04X | %08X | %08X " % (tag_id, tag_type, num_of_value, value)
        if tag_id in recognised_tags:
            row += " " + recognised_tags[tag_id]
        print(row)

        # If this is the datetime one we're looking for, make a note of the offset
        if tag_id == 0x0132:
            assert tag_type == 2, "datetime entry has unexpected type %d" % tag_type
            assert num_of_value == 20, "datetime entry has unexpected count %d" % num_of_value
            datetime_offset = value

    return datetime_offset

if __name__ == '__main__':
    # Script entry point: dump the CR2 header and IFD #0, then print the
    # date/time string.
    with open("IMG_6113.CR2", "rb") as f:
        # read the first 1kb of the file should be enough to find the date/time
        buffer = f.read(1024)

    # Grab the various parts of the header
    (byte_order, tiff_magic_word, tiff_offset, cr2_magic_word, cr2_major_version, cr2_minor_version, raw_ifd_offset) = GetHeaderFromCR2(buffer)

    # Set the endian flag.  The fallback is '=' (native byte order, standard
    # sizes) rather than '@', because native alignment would pad the
    # 12-byte IFD entries on some platforms.
    endian_flag = '='
    if byte_order == 0x4D4D:
        # motorola format
        endian_flag = '>'
    elif byte_order == 0x4949:
        # intel format
        endian_flag = '<'

    # Search for the datetime entry offset
    datetime_offset = FindDateTimeOffsetFromCR2(buffer, 0x10, endian_flag)

    if datetime_offset < 0:
        # -1 means IFD #0 has no 0x0132 entry; there is nothing to unpack.
        print("\nDatetime tag (0x0132) not found\n")
    elif datetime_offset + 20 > len(buffer):
        # The 20-byte string lies beyond the part of the file we read.
        print("\nDatetime string at offset %d is outside the %d bytes read\n"
              % (datetime_offset, len(buffer)))
    else:
        # Read the 20-byte field in one go and decode it to text.
        (datetime_bytes,) = unpack_from('20s', buffer, datetime_offset)
        print("\nDatetime: " + datetime_bytes.decode('ascii', 'replace') + "\n")

Question 2

I'm trying to extract the date/time when a picture was taken from the CR2 (Canon format for raw pictures).

I know the CR2 specification, and I know I can use Python struct module to extract pieces from a binary buffer.

Briefly, the specification says that in Tag 0x0132 / 306 I can find a string of length 20 - the date and time.

I tried to get that tag by using:

struct.unpack_from(20*'s', buffer, 0x0132)

but I get

('\x00', '\x00', "'", '\x88', ...[and more crap])

Any ideas?

Edit

Many thanks for the thorough effort! The answers are phenomenal and I learned a lot about handling binary data.

Reading a CR2 (Raw Canon Image) header using Python

Answer

Related questions