[xiph-cvs] cvs commit: positron/positron MP3Info.py

Wed Jul 2 10:57:30 PDT 2003

volsung     03/07/02 13:57:30

  Modified:    positron MP3Info.py
  Log:
  More fixes to hopefully improve MP3 detection:
  
  * Check a random location in the middle of the file for a frame header
  
  * If a bad header is found during a linear search, keep going
  
  * ID3v2 frame lengths (not the length of the whole ID3v2 tag) are sometimes
  sync-safe integers and sometimes not depending on minor version of the v2
  tags.  Patch from Alec Mitchell <apm13 at columbia.edu> fixes this.  Hopefully
  closes bug 367 and 377.

Revision  Changes    Path
1.9       +81 -23    positron/positron/MP3Info.py

Index: MP3Info.py
===================================================================
RCS file: /usr/local/cvsroot/positron/positron/MP3Info.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9

--- MP3Info.py	19 Jun 2003 19:08:44 -0000	1.8
+++ MP3Info.py	2 Jul 2003 17:57:30 -0000	1.9
@@ -31,6 +31,7 @@
 
 import struct
 import string
+import random
 
 def _from_synch_safe(synchsafe):
     if isinstance(synchsafe, type(1)):
@@ -94,8 +95,13 @@
         size = ()
         if version == 2:
             size = struct.unpack('!3b', file.read(3))
-        elif version == 3 or version == 4:
+            self.size = (size[0] * 256 + size[1]) * 256 + size[2]
+        elif version == 3:
+            size = struct.unpack('!L', file.read(4))
+            self.size = size[0]
+        elif version == 4:
             size = struct.unpack('!4b', file.read(4))
+            self.size = _from_synch_safe(size)
 
         if version == 3:  # abc00000 def00000
             (flags,) = struct.unpack('!1b', file.read(1))
@@ -118,7 +124,6 @@
             self.f_unsynchronization       = flags >> 1 & 1 #n
             self.f_data_length_indicator   = flags >> 0 & 1 #p
 
-        self.size = _from_synch_safe(size)
         self.data = _strip_zero(file.read(self.size))
 
 _genres = [
@@ -269,7 +274,7 @@
 
 _emphases = [ "none", "50/15 ms", "reserved", "CCIT J.17" ]
 
-_MP3_HEADER_SEEK_LIMIT = 4096
+_MP3_HEADER_SEEK_LIMIT = 500000
 
 class MPEG:
     def __init__(self, file, seeklimit=_MP3_HEADER_SEEK_LIMIT, seekstart=0):
@@ -292,49 +297,103 @@
         self.emphasis = ""
         self.length = 0
 
+
+        # First do a check to see if this is really an MPEG file.
+        #
+        # The longest possible frame for any MPEG audio file
+        # is 4609 bytes for a MPEG 2, Layer 1 256 kbps, 8000Hz with
+        # a padding slot.  Add an extra 4 bytes to ensure we get the
+        # next header and round up to a multiple of 4 to get the magic
+        # number 4616.  If this is an MPEG file, then from a random
+        # point in the middle (far away from the tag stupidity), we
+        # should always find an MPEG frame header in any 4616 byte
+        # substring.
+        #
+        # We pick a location in the middle 50% of the file to
+        # do a header test.  If it passes, then we proceed with parsing
+        # (using much less restrictive searching)
+        test_pos = int(random.uniform(0.25,0.75) * self.filesize)
+
+        offset, header = self._find_header(file, seeklimit=4616,
+                                           seekstart=test_pos)
+        if offset == -1 or header is None:
+            raise Error("Failed MPEG frame test.")
+            
+        # Now we can look for the first header
         offset, header = self._find_header(file, seeklimit, seekstart)
         if offset == -1 or header is None:
             raise Error("Could not find MPEG header")
 
-        self._parse_header(header)
-        ### offset + framelength will find another header. verify??
+        # Note that _find_header already parsed the header
+        
         if not self.valid:
             raise Error("MPEG header not valid")
 
         self._parse_xing(file, seeklimit, seekstart)
-        
-
+    
     def _find_header(self, file, seeklimit=_MP3_HEADER_SEEK_LIMIT,
-                     seekstart=0):
-        file.seek(seekstart, 0)
-        header = file.read(4) # see if we get lucky with the first four bytes
+                     seekstart=0, check_next_header=True):
+        amt = 5120  # Multiple of 512 is hopefully more efficient to read from
+                    # disk, and size ensure the random test will only
+                    # read once
         curr_pos = 0
-        amt = 1024
+        read_more = False
+
+        file.seek(seekstart, 0)
+        header = file.read(amt)
         
-        while len(header) <= seeklimit:
-            
+        while curr_pos <= seeklimit:            
             # look for the sync byte
             offset = string.find(header, chr(255), curr_pos)
+            #print curr_pos + seekstart
             if offset == -1:
                 curr_pos = len(header)  # Header after everything so far
+                read_more = True
             elif offset + 4 > len(header):
                 curr_pos = offset  # Need to read more, jump back here later
+                read_more = True
             elif ord(header[offset+1]) & 0xE0 == 0xE0:
-                return seekstart+offset, header[offset:offset+4]
+
+                # Finish now if we should not check the next header
+                if not check_next_header:
+                    return seekstart+offset, header[offset:offset+4]
+
+                # We have a possible winner, test parse this header and
+                # check if the next header is in the right place.
+                # WARNING: _parse_header has side effects!  This should
+                # be fixed, though in this case it does not matter.
+                self._parse_header(header[offset:offset+4])
+                    
+                if self.valid:
+                    next_off, next_header = \
+                              self._find_header(file, seeklimit=0,
+                                                seekstart=seekstart+offset
+                                                        +self.framelength,
+                                                check_next_header=False)
+                    if next_off != -1:
+                        return seekstart+offset, header[offset:offset+4]
+                    else:
+                        curr_pos = offset+2
+                else:
+                    curr_pos = offset+2
+                    
             else:
                 curr_pos = offset+2 # Gotta be after the 2 bytes we looked at
 
-            chunk = file.read(amt)  # Read bigger chunks
-            header += chunk
-
-            if len(chunk) == 0:
-                # no more to read, give up
-                return -1, None
+            if read_more and curr_pos <= seeklimit:
+                chunk = file.read(amt)
+                if len(chunk) == 0:
+                    # no more to read, give up
+                    return -1, None
+                else:
+                    header += chunk
         
         # couldn't find the header
         return -1, None
 
     def _parse_header(self, header):
+        self.valid = 0 # Assume the worst until proven otherwise
+        
         # AAAAAAAA AAABBCCD EEEEFFGH IIJJKLMM
         (bytes,) = struct.unpack('>i', header)
         mpeg_version =    (bytes >> 19) & 3  # BB   00 = MPEG2.5, 01 = res, 10 = MPEG2, 11 = MPEG1  
@@ -450,9 +509,8 @@
             self.id3 = id3v2
 
         if id3v2.valid:
-            # We'll be generous for files with ID3v2 tags.
-            self.mpeg = MPEG(file, seekstart=id3v2.header_size,
-                             seeklimit=10*_MP3_HEADER_SEEK_LIMIT)
+            # ID3v2 size (header_size) doesn't include 10 bytes of header
+            self.mpeg = MPEG(file, seekstart=id3v2.header_size+10)
         else:
             # Header better be near the beginning if there is no ID3v2
             self.mpeg = MPEG(file)

<p><p>--- >8 ----
List archives:  http://www.xiph.org/archives/
Ogg project homepage: http://www.xiph.org/ogg/
To unsubscribe from this list, send a message to 'cvs-request at xiph.org'
containing only the word 'unsubscribe' in the body.  No subject is needed.
Unsubscribe messages sent to the list will be ignored/filtered.