Are the compressed bytes inside GZIP and PKZIP files compatible? - gzip

This question is a follow-up to "How are zlib, gzip and zip related? What do they have in common and how are they different?" The answers are very detailed but they never quite answer my specific question.
Given a valid GZIP file, should I always be able to extract the deflate-bytes inside and use those bytes to construct a valid PKZIP file with the same contents, without decompressing and recompressing that byte stream?
For example, imagine I have a collection of GZIP files. Could I write a program that quickly (by avoiding deflate/inflate) constructs an equivalent PKZIP file of those files by cutting the GZIP headers off the source files and building a PKZIP structure around the byte streams? (Also the same in reverse by taking any valid PKZIP file and quickly convert them into many GZIP files?)
Both file formats appear to use the same "deflate" algorithm, but is it exactly the same deflate algorithm?

Yes. It is exactly the same deflate format.
(The deflate algorithm can be, and in fact often is different, producing different deflate streams. However that is irrelevant to your application. The format is compatible, and any compliant inflator will be able to decompress the gzip deflate data transplanted into a zip file.)
I forgot why I wrote this, but the C code below will convert a gzip file to a single-entry zip file, with some constraints on the gzip file.
/*
gz2zip.c version 1.0, 31 July 2018
Copyright (C) 2018 Mark Adler
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Mark Adler
madler#alumni.caltech.edu
*/
// Convert gzip (.gz) file to a single entry zip file. See the comments before
// gz2zip() for more details and caveats.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
# include <fcntl.h>
# include <io.h>
# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
#else
# define SET_BINARY_MODE(file)
#endif
#define local static
// Exit on error.
local void bail(const char *why) {
    // Print the reason for the failure on stderr, prefixed with the program
    // name, then terminate the process with exit status 1. This function
    // does not return. why is taken as a pointer to const since it is only
    // read, and every caller passes a string literal.
    fprintf(stderr, "gz2zip abort: %s\n", why);
    exit(1);
}
// Type to track number of bytes written.
typedef struct tally_s {
    FILE *out;      // stream that receives the bytes
    off_t off;      // running count of bytes written to out so far
} tally_t;
// Write len bytes at dat to t.
local void put(tally_t *t, void const *dat, size_t len) {
    // A short write is fatal: bail() does not return.
    if (fwrite(dat, 1, len, t->out) != len)
        bail("write error");
    // Only count the bytes once they have been handed to the stream.
    t->off += len;
}
// Write 16-bit integer n in little-endian order to t.
local void put2(tally_t *t, unsigned n) {
    // Low byte first, then the next-higher byte; any bits of n above the low
    // 16 are discarded.
    unsigned char const pair[2] = {
        (unsigned char)(n & 0xff),
        (unsigned char)((n >> 8) & 0xff)
    };
    put(t, pair, sizeof(pair));
}
// Write 32-bit integer n in little-endian order to t.
local void put4(tally_t *t, unsigned long n) {
    // Emit the low 16 bits followed by the high 16 bits, each of them in
    // little-endian order via put2().
    unsigned const low_half = (unsigned)(n & 0xffff);
    unsigned const high_half = (unsigned)((n >> 16) & 0xffff);
    put2(t, low_half);
    put2(t, high_half);
}
// Write n zeros to t.
local void putz(tally_t *t, unsigned n) {
    // The zeros are written one byte at a time through put(), so that the
    // byte tally and the write-error check stay in one place.
    unsigned char const zero = 0;
    for (unsigned left = n; left != 0; left--)
        put(t, &zero, 1);
}
// Convert the Unix time when to DOS time in the four bytes at *dos. If there
// is a conversion error for any reason, store the current time in DOS format
// at *dos. The Unix time in seconds is rounded up to an even number of
// seconds, since the DOS time can only represent even seconds. If the Unix
// time is before 1980, the minimum DOS time of Jan 1, 1980 is used.
//
// The four bytes are the little-endian DOS time word followed by the
// little-endian DOS date word:
//   time: bits 0-4 seconds/2, bits 5-10 minutes, bits 11-15 hours
//   date: bits 0-4 day, bits 5-8 month (1-12), bits 9-15 years since 1980
//
// The time parameter is deliberately not called "unix": compilers in GNU C
// mode predefine unix as a macro on Unix-like targets, which would break the
// declaration.
local void unix2dos(unsigned char *dos, time_t when) {
    when += when & 1;                       // round up to even seconds
    struct tm *s = localtime(&when);
    if (s == NULL) {
        when = time(NULL);                  // on error, use current time
        when += when & 1;
        s = localtime(&when);
        if (s == NULL)
            bail("internal error");         // shouldn't happen
    }
    if (s->tm_year < 80) {                  // no DOS time before 1980
        dos[0] = 0; dos[1] = 0;             // use midnight,
        dos[2] = (1 << 5) + 1; dos[3] = 0;  // Jan 1, 1980
    }
    else {
        // each assignment keeps only the low eight bits of the expression,
        // which is what splits the 16-bit words into their two bytes
        dos[0] = (s->tm_min << 5) + (s->tm_sec >> 1);
        dos[1] = (s->tm_hour << 3) + (s->tm_min >> 3);
        dos[2] = ((s->tm_mon + 1) << 5) + s->tm_mday;
        dos[3] = ((s->tm_year - 80) << 1) + ((s->tm_mon + 1) >> 3);
    }
}
// Chunk size for reading and writing raw deflate data.
#define CHUNK 16384
// Read the gzip file from in and write it as a single-entry zip file to out.
// This assumes that the gzip file has a single member, that it has no junk
// after the gzip trailer, and that it contains less than 4GB of uncompressed
// data. The gzip file is not decompressed or validated, other than checking
// for the proper header format. The modification time from the gzip header is
// used for the zip entry, unless it is not present, in which case the current
// local time is used for the zip entry. The file name from the gzip header is
// used for the zip entry, unless it is not present, in which case "-" is used.
// This does not use the Zip64 format, so the offsets in the resulting zip file
// must be less than 4GB. If name is not NULL, then the zero-terminated string
// at name is used as the file name for the single entry. Whether the file name
// comes from the gzip header or from name, it is truncated to 64K-1 characters
// if necessary.
//
// It is recommended that unzip -t be used on the resulting file to verify its
// integrity. If the gzip files do not obey the constraints above, then the zip
// file will not be valid.
local void gz2zip(FILE *in, FILE *out, char *name) {
    // zip file constant headers for local, central, and end record
    unsigned char const loc[] = {'P', 'K', 3, 4, 20, 0, 8, 0, 8, 0};
    unsigned char const cen[] = {'P', 'K', 1, 2, 20, 0, 20, 0, 8, 0, 8, 0};
    unsigned char const end[] = {'P', 'K', 5, 6, 0, 0, 0, 0, 1, 0, 1, 0};

    // gzip header
    unsigned char head[10];

    // zip file modification date, CRC, and sizes -- initialize to zero for the
    // local header (the actual CRC and sizes follow the compressed data)
    unsigned char desc[16] = {0};

    // name from gzip header to use for the zip entry (the maximum size of the
    // name is 64K-1 -- if the gzip name is longer, then it is truncated)
    unsigned name_len;
    char save[65535];

    // read and interpret the gzip header, bailing if it is invalid or has an
    // unknown compression method or flag bits set
    size_t got = fread(head, 1, sizeof(head), in);
    if (got < sizeof(head) ||
        head[0] != 0x1f || head[1] != 0x8b || head[2] != 8 || (head[3] & 0xe0))
        bail("input not gzip");
    if (head[3] & 4) {      // extra field (ignore)
        int low = getc(in);
        int high = getc(in);
        if (low == EOF || high == EOF)
            bail("premature end of gzip input");
        unsigned extra = (unsigned)low + ((unsigned)high << 8);
        // skip the extra field one byte at a time -- it must not be read into
        // name, which is either NULL or the caller's string
        while (extra--)
            if (getc(in) == EOF)
                bail("premature end of gzip input");
    }
    if (head[3] & 8) {      // file name (save)
        name_len = 0;
        int ch;
        while ((ch = getc(in)) != 0 && ch != EOF)
            if (name_len < sizeof(save))    // capacity of save, not of a pointer
                save[name_len++] = ch;
    }
    else {                  // no file name
        name_len = 1;
        save[0] = '-';
    }
    if (head[3] & 16) {     // comment (ignore)
        int ch;
        while ((ch = getc(in)) != 0 && ch != EOF)
            ;
    }
    if (head[3] & 2) {      // header crc (ignore)
        getc(in);
        getc(in);
    }
    // (a premature end of input inside the name, comment, or header crc is
    // caught below, when the eight bytes of the trailer cannot be read)

    // use name from argument if present, otherwise from gzip header
    if (name == NULL)
        name = save;
    else {
        size_t given = strlen(name);
        name_len = given > 65535 ? 65535 : (unsigned)given;
    }

    // set modification time and date in descriptor from gzip header
    time_t mod = head[4] + (head[5] << 8) + ((time_t)(head[6]) << 16) +
                 ((time_t)(head[7]) << 24);
    unix2dos(desc, mod ? mod : time(NULL));

    // initialize tally of output bytes
    tally_t zip = {out, 0};

    // write zip local header
    off_t locoff = zip.off;
    put(&zip, loc, sizeof(loc));
    put(&zip, desc, sizeof(desc));
    put2(&zip, name_len);
    putz(&zip, 2);
    put(&zip, name, name_len);

    // copy raw deflate stream, saving eight-byte gzip trailer -- buf always
    // holds the last eight bytes read but not yet written, so that when the
    // input ends, those eight bytes are the trailer
    unsigned char buf[CHUNK + 8];
    if (fread(buf, 1, 8, in) != 8)
        bail("premature end of gzip input");
    off_t comp = 0;
    while ((got = fread(buf + 8, 1, CHUNK, in)) != 0) {
        put(&zip, buf, got);
        comp += got;
        memmove(buf, buf + got, 8);
    }
    if (ferror(in))
        bail("read error");

    // write descriptor based on gzip trailer and compressed count
    memcpy(desc + 4, buf, 4);           // CRC-32 of uncompressed data
    desc[8] = comp;                     // compressed size, little-endian
    desc[9] = comp >> 8;
    desc[10] = comp >> 16;
    desc[11] = comp >> 24;
    memcpy(desc + 12, buf + 4, 4);      // uncompressed size
    put(&zip, desc + 4, sizeof(desc) - 4);

    // write zip central directory
    off_t cenoff = zip.off;
    put(&zip, cen, sizeof(cen));
    put(&zip, desc, sizeof(desc));
    put2(&zip, name_len);
    putz(&zip, 12);
    put4(&zip, locoff);
    put(&zip, name, name_len);

    // write zip end-of-central-directory record
    off_t endoff = zip.off;
    put(&zip, end, sizeof(end));
    put4(&zip, endoff - cenoff);
    put4(&zip, cenoff);
    putz(&zip, 2);
}
// Convert the gzip file on stdin to a zip file on stdout. If present, the
// first argument is used as the file name in the zip entry.
int main(int argc, char **argv) {
    // avoid end-of-line conversions on evil operating systems
    SET_BINARY_MODE(stdin);
    SET_BINARY_MODE(stdout);

    // convert .gz on stdin to .zip on stdout -- error returns use exit()
    gz2zip(stdin, stdout, argc > 1 ? argv[1] : NULL);

    // stdout may be buffered, so a write failure can surface only when the
    // stream is flushed -- check that here instead of exiting with success
    if (fflush(stdout) != 0)
        bail("write error");
    return 0;
}

Related

How can I read \x1a from a file? [duplicate]

I am attempting to write a bittorrent client. In order to parse the file etc. I need to read a torrent file into memory. I have noticed that fread is not reading the entire file into my buffer. After further investigation it appears that whenever the symbol shown below is encountered in the file, fread stops reading the file. Calling the feof function on the FILE* pointer returns 16 indicating that the end of file has been reached. This occurs no matter where the symbol is placed. Can somebody explain why this happens and any solutions that may work.
The symbol is highlighted below:
Here is the code that does the read operation:
/*
 * Read the whole of the named file into a newly allocated buffer.
 *
 * file: path of the file to read.
 * len:  on success, receives the number of bytes stored in the returned
 *       buffer. This is the size reported by stat(), or fewer if the end of
 *       the file was reached earlier.
 *
 * Returns the buffer, which the caller must free(), or NULL if the file
 * cannot be stat'ed or opened, if memory cannot be allocated, or if a read
 * error occurs.
 *
 * The file is opened in binary mode ("rb"). In text mode some platforms
 * translate the data and treat the byte 0x1A (Ctrl+Z) as the end of the file,
 * which cuts the read short.
 */
char *read_file(const char *file, long long *len){
    struct stat st;
    FILE *fp;
    char *ret;
    size_t size;
    size_t total = 0;

    //get the size/length of the file
    if(stat(file, &st)){
        return NULL;
    }
    if(st.st_size < 0){
        return NULL;
    }
    size = (size_t)st.st_size;

    //open a stream to the specified file, with no text-mode translations
    fp = fopen(file, "rb");
    if(!fp){
        return NULL;
    }

    //allocate space in the buffer for the file (at least one byte, so that an
    //empty file still yields a valid pointer)
    ret = (char*)malloc(size ? size : 1);
    if(!ret){
        fclose(fp);
        return NULL;
    }

    //read until the expected size is reached or no more data is delivered --
    //fread may legitimately return less than requested
    while(total < size){
        size_t got = fread(ret + total, 1, size - total, fp);
        if(got == 0){
            break;
        }
        total += got;
    }

    //a read error leaves the buffer contents unusable
    if(ferror(fp)){
        fclose(fp);
        free(ret);
        return NULL;
    }

    //cleanup by closing the file stream
    fclose(fp);
    *len = (long long)total;
    return ret;
}
Thank you for your time :)
Your question is oddly relevant as I recently ran into this problem in an application here at work last week!
The ASCII value of this character is decimal 26 (0x1A, \SUB, SUBSTITUTE). This is used to represent the CTRL+Z key sequence or an End-of-File marker.
Change your fopen mode ("In [Text] mode, CTRL+Z is interpreted as an end-of-file character on input.") to get around this on Windows:
fp = fopen(file, "rb"); /* b for 'binary', disables Text-mode translations */
You should open the file in binary mode. Some platforms, in text (default) mode, interpret some bytes as being physical end of file markers.
You're opening the file in text rather than raw/binary mode - the arrow is ASCII for EOF. Specify "rb" rather than just "r" for your fopen call.

How to calculate CRC32 over a large amount of data that is split into buffered blocks?

Let's say I have 1024 kB of data, which is buffered in 1 kB blocks and transferred in 1024 pieces from a transmitter to a receiver.
The last buffer contains a calculated CRC32 value as the last 4 bytes.
However, the receiver has to calculate the CRC32 buffer by buffer, because of the RAM constraints.
I wonder how to apply a linear distributed addition of CRC32 calculations to match the total CRC32 value.
I looked at CRC calculation and its distributive preference. The calculation and its linearity is not much clear to implement.
So, is there a mathematical expression for addition of calculated CRC32s over buffers to match with the CRC32 result which is calculated over total?
Such as:
// Sketch of the combination being asked about: add up the CRC32 values of
// the 1024 individual buffers to obtain a value for the whole transfer.
int CRC32Total = 0;
// One CRC32 value per buffer. The sketch does not show how these are filled.
int CRC32[1024];
for(int i = 0; i < 1024; i++){
// plain integer addition of each per-buffer value into the running total
CRC32Total = CRC32Total + CRC32[i];
}
Kind Regards
You did not provide any clues as to what implementation or even what language for which you "looked at CRC calculation". However every implementation I've seen is designed to compute CRCs piecemeal, exactly like you want.
For the crc32() routine provided in zlib, it is used thusly (in C):
crc = crc32(0, NULL, 0); // initialize CRC value
crc = crc32(crc, firstchunk, 1024); // update CRC value with first chunk
crc = crc32(crc, secondchunk, 1024); // update CRC with second chunk
...
crc = crc32(crc, lastchunk, 1024); // complete CRC with the last chunk
Then crc is the CRC of the concatenation of all of the chunks. You do not need a function to combine the CRCs of individual chunks.
If for some other reason you do want a function to combine CRCs, e.g. if you need to split the CRC calculation over multiple CPUs, then zlib provides the crc32_combine() function for that purpose.
When you start the transfer, reset the CrcChecksum to its initial value with the OnFirstBlock method. For every block received, call the OnBlockReceived to update the checksum. Note that the blocks must be processed in the correct order. When the final block has been processed, the final CRC is in the CrcChecksum variable.
// In crc32.c
uint32_t UpdateCrc(uint32_t crc, const void *data, size_t length)
const uint8_t *current = data;
while (length--)
crc = (crc >> 8) ^ Crc32Lookup[(crc & 0xFF) ^ *current++];
}
// In your block processing application
// Running checksum of all of the blocks processed since the last reset.
static uint32_t CrcChecksum;

// Reset the running checksum to zero before the first block of a transfer.
void OnFirstBlock(void)
{
    CrcChecksum = UINT32_C(0);
}
// Fold the length bytes at data into the running checksum. The current
// checksum is passed to UpdateCrc() and replaced by the value it returns.
void OnBlockReceived(const void *data, size_t length)
{
    const uint32_t updated = UpdateCrc(CrcChecksum, data, length);
    CrcChecksum = updated;
}
To complement my comment to your question, I have added code here that goes thru the whole process: data generation as a linear array, CRC32 added to the transmitted data, injection of errors, and reception in 'chunks' with computed CRC32 and detection of errors. You're probably only interested in the 'reception' part, but I think having a complete example makes it more clear for your comprehension.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
// ---------------------- buildCRC32table ------------------------------
// Polynomial exclusive-ored into the register whenever a one bit is shifted
// out of its low end.
static const uint32_t CRC32_POLY = 0xEDB88320;
// Mask applied to the CRC on the way into and out of myCRC32().
static const uint32_t CRC32_XOR_MASK = 0xFFFFFFFF;
// Lookup table with one entry per possible byte value, filled in by
// buildCRC32table().
static uint32_t CRC32TABLE[256];

// Fill CRC32TABLE. Entry b is the result of starting with the value b and
// performing eight shift-right steps, exclusive-oring CRC32_POLY into the
// value after every step that shifted out a one bit.
void buildCRC32table (void)
{
    for (unsigned value = 0; value < 256u; ++value)
    {
        uint32_t reg = value;
        for (int bitsLeft = 8; bitsLeft > 0; --bitsLeft)
        {
            reg = (reg & 1u) ? ((reg >> 1) ^ CRC32_POLY) : (reg >> 1);
        }
        CRC32TABLE[value] = reg;
    }
}
// -------------------------- myCRC32 ----------------------------------
// Continue a CRC32 computation over dataLen bytes at pData.
// previousCRC32 is the value returned for the data processed so far, or 0
// when starting. The XOR mask is removed on entry and applied again on exit,
// so the returned value can be passed straight back in for the next chunk.
// CRC32TABLE must have been filled in before the first call.
uint32_t myCRC32 (uint32_t previousCRC32, uint8_t *pData, int dataLen)
{
    uint32_t reg = previousCRC32 ^ CRC32_XOR_MASK;  // undo the final mask (or apply the initial one)
    for (; dataLen != 0; --dataLen)
    {
        // the low byte of the register combined with the next data byte
        // selects the table entry; the upper 24 bits shift down
        const uint8_t tableIndex = (uint8_t)(reg & 0x000000FF) ^ *pData++;
        reg = (reg >> 8) ^ CRC32TABLE[tableIndex];
    }
    return reg ^ CRC32_XOR_MASK;                    // apply the mask again
}
// ------------------------------ main ---------------------------------
// Demonstration of computing a CRC32 piecemeal: build 1 MiB of random data
// with its CRC32 stored in the last four bytes, optionally flip one bit, then
// recompute the CRC32 over the data in 1024 chunks of 1024 bytes and report
// whether an error was detected.
int main()
{
    // build CRC32 table
    buildCRC32table();
    uint32_t crc32;
    // use a union so we can access the same data linearly (TX) or by chunks (RX)
    // (static storage: one megabyte is too large to place safely on the stack,
    // where it can exceed the default stack size of some platforms)
    static union
    {
        uint8_t array[1024*1024];
        uint8_t chunk[1024][1024];
    } data;
    // number of data bytes in front of the stored CRC32
    const size_t payloadLen = sizeof(data.array) - sizeof(uint32_t);
    // use time to seed randomizer so we have different data every run
    srand((unsigned int)time(NULL));
    /////////////////////////////////////////////////////////////////////////// Build data to be transmitted
    ////////////////////////////////////////////////////////////////////////////////////////////////////////
    // populate array with random data sparing space for the CRC32 at the end
    for (size_t i = 0; i < payloadLen; i++)
    {
        data.array[i] = (uint8_t) (rand() & 0xFF);
    }
    // now compute array's CRC32
    crc32 = myCRC32(0, data.array, (int)payloadLen);
    printf ("array CRC32 = 0x%08X\n", (unsigned int)crc32);
    // to store the CRC32 into the array, we want to remove the XOR mask so we can compute the CRC32
    // of all received data (including the CRC32 itself) and expect the same result all the time,
    // regardless of the data, when no errors are present
    crc32 ^= CRC32_XOR_MASK;
    // load CRC32 at the very end of the array, least significant byte first
    data.array[sizeof(data.array) - 1] = (uint8_t)((crc32 >> 24) & 0xFF);
    data.array[sizeof(data.array) - 2] = (uint8_t)((crc32 >> 16) & 0xFF);
    data.array[sizeof(data.array) - 3] = (uint8_t)((crc32 >> 8) & 0xFF);
    data.array[sizeof(data.array) - 4] = (uint8_t)((crc32 >> 0) & 0xFF);
    /////////////////////////////////////////////// At this point, data is transmitted and errors may happen
    ////////////////////////////////////////////////////////////////////////////////////////////////////////
    // to make things interesting, let's add one bit error with 1/8 probability
    if ((rand() % 8) == 0)
    {
        uint32_t index = (uint32_t)(rand() % sizeof(data.array));
        uint8_t errorBit = (uint8_t)(1 << (rand() & 0x7));
        // add error
        data.array[index] ^= errorBit;
        printf("Error injected on byte %u, bit mask = 0x%02X\n", (unsigned int)index, (unsigned int)errorBit);
    }
    else
    {
        printf("No error injected\n");
    }
    /////////////////////////////////////////////////////// Once received, the data is processed in 'chunks'
    ////////////////////////////////////////////////////////////////////////////////////////////////////////
    // now we access the data and compute its CRC32 one chunk at a time
    crc32 = 0; // initialize CRC32
    for (int i = 0; i < 1024; i++)
    {
        crc32 = myCRC32(crc32, data.chunk[i], sizeof data.chunk[i]);
    }
    printf ("Final CRC32 = 0x%08X\n", (unsigned int)crc32);
    // because the CRC32 algorithm applies an XOR mask at the end, when we have no errors, the computed
    // CRC32 will be the mask itself
    if (crc32 == CRC32_XOR_MASK)
    {
        printf ("No errors detected!\n");
    }
    else
    {
        printf ("Errors detected!\n");
    }
    return 0;
}

Webm (VP8 / Opus) file read and write back

I am trying to develop a webrtc simulator in C/C++. For media handling, I plan to use libav. I am thinking of below steps to realize media exchange between two webrtc simulator. Say I have two webrtc simulators A and B.
Read media at A from a input webm file using av_read_frame api.
I assume I will get the encoded media (audio / video) data, am I correct here?
Send the encoded media data to simulator B over a UDP socket.
Simulator B receives the media data in UDP socket as RTP packets.
Simulator B extracts audio/video data from just received RTP packet.
I assume the extracted media data at simulator B are the encoded data only (am I correct here). I do not want to decode it. I want to write it to a file. Later I will play the file to check if I have done everything right.
To simplify this problem lets take out UDP socket part. Then my question reduces to read data from a webm input file, get the encoded media, prepare the packet and write to a output file using av_interleaved_write_frame or any other appropriate api. All these things I want to do using libav.
Is there any example code I can refer.
Or can somebody please guide me to develop it.
I am trying with a test program. As a first step, my aim is to read from a file and write to an output file. I have below code, but it is not working properly.
//#define _AUDIO_WRITE_ENABLED_
#include "libavutil/imgutils.h"
#include "libavutil/samplefmt.h"
#include "libavformat/avformat.h"
static AVPacket pkt;
static AVFormatContext *fmt_ctx = NULL;
static AVFormatContext *av_format_context = NULL;
static AVOutputFormat *av_output_format = NULL;
static AVCodec *video_codec = NULL;
static AVStream *video_stream = NULL;
static AVCodec *audio_codec = NULL;
static AVStream *audio_stream = NULL;
static const char *src_filename = NULL;
static const char *dst_filename = NULL;
/*
 * Read packets from the input file named by argv[1] and write those that
 * belong to the video stream (and, when _AUDIO_WRITE_ENABLED_ is defined, the
 * audio stream) to the output file named by argv[2], without decoding them.
 *
 * The output streams are configured with fixed parameters (VP8, 640x480,
 * time base 1/30, and so on) rather than with the parameters of the input.
 *
 * Exit status: 0 on success or when the arguments are wrong, otherwise a
 * non-zero value that identifies the step that failed.
 */
int main (int argc, char **argv)
{
    int ret = 0;
    int index = 0;
    if (argc != 3)
    {
        printf("Usage: ./webm input_video_file output_video_file \n");
        exit(0);
    }
    src_filename = argv[1];
    dst_filename = argv[2];
    printf("Source file = %s , Destination file = %s\n", src_filename, dst_filename);
    av_register_all();
    /* open input file, and allocate format context */
    if (avformat_open_input(&fmt_ctx, src_filename, NULL, NULL) < 0)
    {
        fprintf(stderr, "Could not open source file %s\n", src_filename);
        exit(1);
    }
    /* retrieve stream information */
    if (avformat_find_stream_info(fmt_ctx, NULL) < 0)
    {
        fprintf(stderr, "Could not find stream information\n");
        exit(2);
    }
    /* pick the output container from the destination file name */
    av_output_format = av_guess_format(NULL, dst_filename, NULL);
    if(!av_output_format)
    {
        fprintf(stderr, "Could not guess output file format\n");
        exit(3);
    }
    av_output_format->audio_codec = AV_CODEC_ID_VORBIS;
    av_output_format->video_codec = AV_CODEC_ID_VP8;
    av_format_context = avformat_alloc_context();
    if(!av_format_context)
    {
        fprintf(stderr, "Could not allocation av format context\n");
        exit(4);
    }
    av_format_context->oformat = av_output_format;
    /* bounded copy: the destination is a fixed-size array, and the file name
       comes from the command line */
    snprintf(av_format_context->filename, sizeof(av_format_context->filename),
             "%s", dst_filename);
    video_codec = avcodec_find_encoder(av_output_format->video_codec);
    if (!video_codec)
    {
        fprintf(stderr, "Codec not found\n");
        exit(5);
    }
    video_stream = avformat_new_stream(av_format_context, video_codec);
    if (!video_stream)
    {
        fprintf(stderr, "Could not alloc stream\n");
        exit(6);
    }
    avcodec_get_context_defaults3(video_stream->codec, video_codec);
    video_stream->codec->codec_id = AV_CODEC_ID_VP8;
    video_stream->codec->codec_type = AVMEDIA_TYPE_VIDEO;
    video_stream->time_base = (AVRational) {1, 30};
    video_stream->codec->width = 640;
    video_stream->codec->height = 480;
    video_stream->codec->pix_fmt = PIX_FMT_YUV420P;
    video_stream->codec->flags |= CODEC_FLAG_GLOBAL_HEADER;
    video_stream->codec->bit_rate = 400000;
    video_stream->codec->gop_size = 10;
    video_stream->codec->max_b_frames=1;
#ifdef _AUDIO_WRITE_ENABLED_
    audio_codec = avcodec_find_encoder(av_output_format->audio_codec);
    if (!audio_codec)
    {
        fprintf(stderr, "Codec not found audio codec\n");
        exit(5);
    }
    audio_stream = avformat_new_stream(av_format_context, audio_codec);
    if (!audio_stream)
    {
        fprintf(stderr, "Could not alloc stream for audio\n");
        exit(6);
    }
    avcodec_get_context_defaults3(audio_stream->codec, audio_codec);
    audio_stream->codec->codec_id = AV_CODEC_ID_VORBIS;
    audio_stream->codec->codec_type = AVMEDIA_TYPE_AUDIO;
    audio_stream->time_base = (AVRational) {1, 30};
    audio_stream->codec->sample_rate = 8000;
    audio_stream->codec->flags |= CODEC_FLAG_GLOBAL_HEADER;
#endif
    if(!(av_output_format->flags & AVFMT_NOFILE))
    {
        if (avio_open(&av_format_context->pb, dst_filename, AVIO_FLAG_WRITE) < 0)
        {
            /* without an open output there is nothing to write to */
            fprintf(stderr, "Could not open '%s'\n", dst_filename);
            exit(7);
        }
    }
    /* Before avformat_write_header set the stream */
    if (avformat_write_header(av_format_context, NULL) < 0)
    {
        fprintf(stderr, "Could not write output file header\n");
        exit(8);
    }
    /* initialize packet, set data to NULL, let the demuxer fill it */
    av_init_packet(&pkt);
    pkt.data = NULL;
    pkt.size = 0;
    pkt.stream_index = video_stream->index;
    ret = av_read_frame(fmt_ctx, &pkt);
    while (ret >= 0)
    {
        index++;
        //pkt.stream_index = video_avstream->index;
        /* NOTE(review): pkt.stream_index is presumably the index of the stream
           in the INPUT file, while video_stream/audio_stream are OUTPUT
           streams -- this comparison assumes both files order their streams
           identically; confirm. Packet timestamps are also passed through
           without rescaling to the output time base. */
        if(pkt.stream_index == video_stream->index)
        {
            printf("Video: Read cycle %d, bytes read = %d, pkt stream index=%d\n", index, pkt.size, pkt.stream_index);
            if (av_write_frame(av_format_context, &pkt) < 0)
            {
                fprintf(stderr, "Error while writing video packet %d\n", index);
            }
        }
#ifdef _AUDIO_WRITE_ENABLED_
        else if(pkt.stream_index == audio_stream->index)
        {
            printf("Audio: Read cycle %d, bytes read = %d, pkt stream index=%d\n", index, pkt.size, pkt.stream_index);
            if (av_write_frame(av_format_context, &pkt) < 0)
            {
                fprintf(stderr, "Error while writing audio packet %d\n", index);
            }
        }
#endif
        av_free_packet(&pkt);
        ret = av_read_frame(fmt_ctx, &pkt);
    }
    av_write_trailer(av_format_context);
    /** Exit procedure starts */
    avformat_close_input(&fmt_ctx);
    /* close the output file that was opened with avio_open() above */
    if(!(av_output_format->flags & AVFMT_NOFILE))
    {
        avio_close(av_format_context->pb);
    }
    avformat_free_context(av_format_context);
    return 0;
}
When I execute this program, it outputs "codec not found". I am not sure what is going wrong. Can somebody help, please?
Codec not found issue is resolved by separately building libvpx1.4 version. Still struggling to read from source file, and writing to a destination file.
EDIT 1: After code modification, only video stuff I am able to write to a file, though some more errors are still present.
EDIT 2: With modified code (2nd round), I see video frames are written properly. For audio frames I added the code under a macro _AUDIO_WRITE_ENABLED_ , but if I enable this macro program crashing. Can somebody guide whats wrong in audio write part (code under macro _AUDIO_WRITE_ENABLED_).
I am not fully answering your question, but I hope we will get to the final solution eventually. When I tried to run your code, I got this error "time base not set".
Time base and other header specs are part of codec. This is, how I have this thing specified for writing into file (vStream is of AVStream):
/* Reset the codec context of the video stream to defaults, using whichever
   variant of the function the libavcodec version being built against
   provides. */
/* NOTE(review): the stream is reached through rc->vStream here but through
   plain vStream below -- presumably the same stream; confirm. */
#if LIBAVCODEC_VER_AT_LEAST(53, 21)
avcodec_get_context_defaults3(rc->vStream->codec, AVMEDIA_TYPE_VIDEO);
#else
avcodec_get_context_defaults2(rc->vStream->codec, AVMEDIA_TYPE_VIDEO);
#endif
/* The VP8 codec identifier is spelled differently depending on the
   libavcodec version. */
#if LIBAVCODEC_VER_AT_LEAST(54, 25)
vStream->codec->codec_id = AV_CODEC_ID_VP8;
#else
vStream->codec->codec_id = CODEC_ID_VP8;
#endif
/* Describe the video in the codec context: media type, time base of 1/30,
   640x480 frame size, and YUV 4:2:0 planar pixel format. Note that the time
   base is set on the codec context, not on the stream. */
vStream->codec->codec_type = AVMEDIA_TYPE_VIDEO;
vStream->codec->time_base = (AVRational) {1, 30};
vStream->codec->width = 640;
vStream->codec->height = 480;
vStream->codec->pix_fmt = PIX_FMT_YUV420P;
EDIT: I ran your program in Valgrind and it segfaults on av_write_frame. Looks like its time_base and other specs for output are not set properly.
Add the specs before avformat_write_header(), before it is too late.

How to find Nal header in h.264 RTP packet

I need to find the NAL header by parsing a RTP packet where each NAL unit is encapsulated into one RTP packet, then i parse the Nal header to know whether it's a PPS unit or not. I tried the following but i got no result:
// Fetch the packet carried by msg; it is treated below as a complete RTP packet.
dataBuffer = (char*)MESSAGE_ReturnPacket(msg);
// hdr points RTP_HDR_SIZE bytes into the packet, i.e. it assumes that the RTP
// header always has that fixed size and that the payload starts right after it.
byte * hdr = (byte*)dataBuffer + RTP_HDR_SIZE; //dataBuffer contains the RTP packet
// Parse the RTP header into rp. The return value of RTPParsing is not checked.
RTPParsing((byte*)dataBuffer,rp,hdr);
// NAL unit type 8 is reported as a PPS (picture parameter set).
if (rp.nal_type == 8 )
{
printf("\n PPS is found \n");
}
else
{
printf("\n No PPS is found\n");
}
where
// Parse the fixed RTP header at pData into rp, and store in rp.nal_type the
// type of the NAL unit whose first byte is at hdr.
//
// pData: start of the RTP packet (at least the 12-byte fixed header).
// rp:    receives the header fields.
// hdr:   start of the RTP payload. In single NAL unit mode the first payload
//        byte is the NAL unit header, whose low five bits are the type.
//
// Returns 0 in all cases. If the version field is not 2, a diagnostic is
// printed and rp is left untouched.
int RTPParsing(byte *pData,RTPpacket_t &rp, byte *hdr)
{
    if ((pData[0] & 0xc0) != (2 << 6)){
        printf("[RTP] version is incorrect! dump = 0x%x 0x%x 0x%x 0x%x\n",
               pData[0], pData[1], pData[2], pData[3]);
        return 0;
    }
    /* parse RTP header -- first byte is laid out as VVPXCCCC */
    rp.v = (pData[0] & 0xc0) >> 6; /* protocol version */
    rp.p = (pData[0] & 0x20) >> 5; /* padding flag */
    rp.x = (pData[0] & 0x10) >> 4; /* header extension flag */
    rp.cc = (pData[0] & 0x0f); /* CSRC count */
    rp.m = (pData[1] & 0x80) >> 7; /* marker bit */
    rp.pt = (pData[1] & 0x7F); //Payload Type
    // NOTE(review): the casts below assume that pData is suitably aligned for
    // 16- and 32-bit access -- confirm for the buffers passed in.
    rp.seq = ntohs (((unsigned short *) pData)[1]); /* sequence number */
    rp.timestamp = ntohl (((unsigned int *) pData)[1]); /* timestamp */
    rp.ssrc = ntohl (((unsigned int *) pData)[2]); /* synchronization source */
    rp.nal_type = (hdr[0] & 0x1F); // NAL unit type is in the FIRST payload byte
    if (rp.cc)
    {
        for (int i = 0; i < rp.cc; i++)
        {
            //fprintf (out, " csrc: 0x%08x",ntohl (((unsigned int *) data)[3 + i]));
        }
    }
    return 0;
}
Any help ?
According to RFC6184 in single NAL unit mode, the "The first byte of a NAL unit co-serves as the
RTP payload header"
Your offset is incorrect (you use 1 instead of 0):
rp.nal_type = (hdr[1] & 0x1F); // get NAL unit's type
Also, hard-coding RTP_HDR_SIZE as 12 (if that's what you're doing) could cause issues since the size of the header may vary based on extension headers, CSRCs, etc.

How to use VideoToolbox to decompress H.264 video stream

I had a lot of trouble figuring out how to use Apple's Hardware accelerated video framework to decompress an H.264 video stream. After a few weeks I figured it out and wanted to share an extensive example since I couldn't find one.
My goal is to give a thorough, instructive example of Video Toolbox introduced in WWDC '14 session 513. My code will not compile or run since it needs to be integrated with an elementary H.264 stream (like a video read from a file or streamed from online etc) and needs to be tweaked depending on the specific case.
I should mention that I have very little experience with video en/decoding except what I learned while googling the subject. I don't know all the details about video formats, parameter structure etc. so I've only included what I think you need to know.
I am using XCode 6.2 and have deployed to iOS devices that are running iOS 8.1 and 8.2.
Concepts:
NALUs: A NALU is simply a chunk of data of varying length that has a NALU start code header 0x00 00 00 01 YY, where the low 5 bits of YY tell you what type of NALU this is and therefore what type of data follows the header. (Since you only need the low 5 bits, I use YY & 0x1F to just get the relevant bits.) I list what all these types are in the array NSString * const naluTypesStrings[], but you don't need to know what they all are.
Parameters: Your decoder needs parameters so it knows how the H.264 video data is stored. The 2 you need to set are Sequence Parameter Set (SPS) and Picture Parameter Set (PPS) and they each have their own NALU type number. You don't need to know what the parameters mean, the decoder knows what to do with them.
H.264 Stream Format: In most H.264 streams, you will receive an initial set of PPS and SPS parameters followed by an i frame (aka IDR frame or flush frame) NALU. Then you will receive several P frame NALUs (maybe a few dozen or so), then another set of parameters (which may be the same as the initial parameters) and an i frame, more P frames, etc. i frames are much bigger than P frames. Conceptually you can think of the i frame as an entire image of the video, and the P frames are just the changes that have been made to that i frame, until you receive the next i frame.
Procedure:
Generate individual NALUs from your H.264 stream. I cannot show code for this step since it depends a lot on what video source you're using. I made this graphic to show what I was working with ("data" in the graphic is "frame" in my following code), but your case may and probably will differ. My method receivedRawVideoFrame: is called every time I receive a frame (uint8_t *frame) which was one of 2 types. In the diagram, those 2 frame types are the 2 big purple boxes.
Create a CMVideoFormatDescriptionRef from your SPS and PPS NALUs with CMVideoFormatDescriptionCreateFromH264ParameterSets( ). You cannot display any frames without doing this first. The SPS and PPS may look like a jumble of numbers, but VTD knows what to do with them. All you need to know is that CMVideoFormatDescriptionRef is a description of video data, like width/height, format type (kCMPixelFormat_32BGRA, kCMVideoCodecType_H264 etc.), aspect ratio, color space etc. Your decoder will hold onto the parameters until a new set arrives (sometimes parameters are resent regularly even when they haven't changed).
Re-package your IDR and non-IDR frame NALUs according to the "AVCC" format. This means removing the NALU start codes and replacing them with a 4-byte header that states the length of the NALU. You don't need to do this for the SPS and PPS NALUs. (Note that the 4-byte NALU length header is in big-endian, so if you have a UInt32 value it must be byte-swapped before copying to the CMBlockBuffer using CFSwapInt32. I do this in my code with the htonl function call.)
Package the IDR and non-IDR NALU frames into CMBlockBuffer. Do not do this with the SPS PPS parameter NALUs. All you need to know about CMBlockBuffers is that they are a method to wrap arbitrary blocks of data in core media. (Any compressed video data in a video pipeline is wrapped in this.)
Package the CMBlockBuffer into CMSampleBuffer. All you need to know about CMSampleBuffers is that they wrap up our CMBlockBuffers with other information (here it would be the CMVideoFormatDescription and CMTime, if CMTime is used).
Create a VTDecompressionSessionRef and feed the sample buffers into VTDecompressionSessionDecodeFrame( ). Alternatively, you can use AVSampleBufferDisplayLayer and its enqueueSampleBuffer: method and you won't need to use VTDecompSession. It's simpler to set up, but will not throw errors if something goes wrong like VTD will.
In the VTDecompSession callback, use the resultant CVImageBufferRef to display the video frame. If you need to convert your CVImageBuffer to a UIImage, see my StackOverflow answer here.
Other notes:
H.264 streams can vary a lot. From what I learned, NALU start code headers are sometimes 3 bytes (0x00 00 01) and sometimes 4 (0x00 00 00 01). My code works for 4 bytes; you will need to change a few things around if you're working with 3.
If you want to know more about NALUs, I found this answer to be very helpful. In my case, I found that I didn't need to ignore the "emulation prevention" bytes as described, so I personally skipped that step but you may need to know about that.
If your VTDecompressionSession outputs an error number (like -12909) look up the error code in your XCode project. Find the VideoToolbox framework in your project navigator, open it and find the header VTErrors.h. If you can't find it, I've also included all the error codes below in another answer.
Code Example:
So let's start by declaring some global variables and including the VT framework (VT = Video Toolbox).
#import <VideoToolbox/VideoToolbox.h>
// Format description built from the stream's SPS/PPS parameter sets.
// Core Foundation type: stored with `assign`, so it is retained/released by hand.
@property (nonatomic, assign) CMVideoFormatDescriptionRef formatDesc;
// VideoToolbox decoder session. Core Foundation type, managed by hand as well.
@property (nonatomic, assign) VTDecompressionSessionRef decompressionSession;
// Display layer, only needed when frames are shown through AVSampleBufferDisplayLayer.
@property (nonatomic, strong) AVSampleBufferDisplayLayer *videoLayer;
// Size in bytes of the last SPS NALU seen, including its 4-byte start code.
@property (nonatomic, assign) int spsSize;
// Size in bytes of the last PPS NALU seen, including its 4-byte start code.
@property (nonatomic, assign) int ppsSize;
The following array is only used so that you can print out what type of NALU frame you are receiving. If you know what all these types mean, good for you, you know more about H.264 than me :) My code only handles types 1, 5, 7 and 8.
// Human-readable names for every possible nal_unit_type (0-31), indexed by the type value.
// The table has exactly 32 entries so that any value masked with 0x1F is a valid index.
// It is only used for logging.
NSString * const naluTypesStrings[] =
{
    @"0: Unspecified (non-VCL)",
    @"1: Coded slice of a non-IDR picture (VCL)", // P frame
    @"2: Coded slice data partition A (VCL)",
    @"3: Coded slice data partition B (VCL)",
    @"4: Coded slice data partition C (VCL)",
    @"5: Coded slice of an IDR picture (VCL)", // I frame
    @"6: Supplemental enhancement information (SEI) (non-VCL)",
    @"7: Sequence parameter set (non-VCL)", // SPS parameter
    @"8: Picture parameter set (non-VCL)", // PPS parameter
    @"9: Access unit delimiter (non-VCL)",
    @"10: End of sequence (non-VCL)",
    @"11: End of stream (non-VCL)",
    @"12: Filler data (non-VCL)",
    @"13: Sequence parameter set extension (non-VCL)",
    @"14: Prefix NAL unit (non-VCL)",
    @"15: Subset sequence parameter set (non-VCL)",
    @"16: Reserved (non-VCL)",
    @"17: Reserved (non-VCL)",
    @"18: Reserved (non-VCL)",
    @"19: Coded slice of an auxiliary coded picture without partitioning (non-VCL)",
    @"20: Coded slice extension (non-VCL)",
    @"21: Coded slice extension for depth view components (non-VCL)",
    @"22: Reserved (non-VCL)",
    @"23: Reserved (non-VCL)",
    @"24: STAP-A Single-time aggregation packet (non-VCL)",
    @"25: STAP-B Single-time aggregation packet (non-VCL)",
    @"26: MTAP16 Multi-time aggregation packet (non-VCL)",
    @"27: MTAP24 Multi-time aggregation packet (non-VCL)",
    @"28: FU-A Fragmentation unit (non-VCL)",
    @"29: FU-B Fragmentation unit (non-VCL)",
    @"30: Unspecified (non-VCL)",
    @"31: Unspecified (non-VCL)",
};
Now this is where all the magic happens.
// Returns the index of the first 4-byte Annex B start code (0x00 00 00 01) that begins at or
// after `from` and lies completely inside the first `length` bytes, or -1 when there is none.
static int64_t findStartCode(const uint8_t *bytes, uint32_t length, uint32_t from)
{
    for (uint64_t i = from; i + 4 <= length; i++)
    {
        if (bytes[i] == 0x00 && bytes[i+1] == 0x00 && bytes[i+2] == 0x00 && bytes[i+3] == 0x01)
        {
            return (int64_t)i;
        }
    }
    return -1;
}

// Handles one chunk of an H.264 Annex B stream that uses 4-byte start codes.
//
// Accepted layouts:
//   [SPS][PPS][IDR or non-IDR slice]  -> (re)creates the format description, then decodes the picture
//   [SPS][PPS]                        -> (re)creates the format description only
//   [IDR slice] or [non-IDR slice]    -> decodes using the existing format description
//
// frame     : raw bytes; the first start code is expected at index 0.
// frameSize : number of valid bytes in `frame`.
// isIFrame  : not used by this implementation.
//
// The picture NALU is copied, its start code is replaced by a big-endian length (AVCC), and the
// copy is wrapped in a CMBlockBuffer / CMSampleBuffer that is handed to -render:.
-(void) receivedRawVideoFrame:(uint8_t *)frame withSize:(uint32_t)frameSize isIFrame:(int)isIFrame
{
    // initialised so that no later check ever reads an indeterminate value
    OSStatus status = noErr;
    uint8_t *data = NULL;
    // I know what my H.264 data source's NALUs look like so I know start code index is always 0.
    uint32_t startCodeIndex = 0;
    uint32_t secondStartCodeIndex = 0;
    // stays 0 unless SPS/PPS precede the picture in this frame, so it doubles as the picture offset
    uint32_t thirdStartCodeIndex = 0;
    long blockLength = 0;
    CMSampleBufferRef sampleBuffer = NULL;
    CMBlockBufferRef blockBuffer = NULL;

    // we need the start code plus at least the one-byte NALU header
    if (frame == NULL || frameSize <= startCodeIndex + 4)
    {
        NSLog(@"Video error: Frame is too short to contain a NALU");
        return;
    }

    int nalu_type = (frame[startCodeIndex + 4] & 0x1F);
    NSLog(@"~~~~~~~ Received NALU Type \"%@\" ~~~~~~~~", naluTypesStrings[nalu_type]);

    // if we havent already set up our format description with our SPS PPS parameters, we
    // can't process any frames except type 7 that has our parameters
    if (nalu_type != 7 && _formatDesc == NULL)
    {
        NSLog(@"Video error: Frame is not an I Frame and format description is null");
        return;
    }

    // NALU type 7 is the SPS parameter NALU
    if (nalu_type == 7)
    {
        // the next start code marks the end of the SPS; the search is bounded by frameSize
        int64_t found = findStartCode(frame, frameSize, startCodeIndex + 4);
        if (found < 0 || found + 4 >= (int64_t)frameSize)
        {
            NSLog(@"Video error: SPS NALU is not followed by another NALU");
            return;
        }
        secondStartCodeIndex = (uint32_t)found;
        _spsSize = (int)secondStartCodeIndex; // includes the header in the size

        // find what the second NALU type is
        nalu_type = (frame[secondStartCodeIndex + 4] & 0x1F);
        NSLog(@"~~~~~~~ Received NALU Type \"%@\" ~~~~~~~~", naluTypesStrings[nalu_type]);
    }

    // type 8 is the PPS parameter NALU
    if (nalu_type == 8)
    {
        // _spsSize is only meaningful for the frame in which the SPS was found
        if (secondStartCodeIndex == 0)
        {
            NSLog(@"Video error: PPS NALU received without a preceding SPS NALU");
            return;
        }

        // the PPS ends at the next start code, or at the end of the frame if nothing follows it
        int64_t found = findStartCode(frame, frameSize, secondStartCodeIndex + 4);
        BOOL pictureFollows = (found >= 0 && found + 4 < (int64_t)frameSize);
        thirdStartCodeIndex = (found >= 0) ? (uint32_t)found : frameSize;
        _ppsSize = (int)(thirdStartCodeIndex - secondStartCodeIndex);

        if (_spsSize <= 4 || _ppsSize <= 4)
        {
            NSLog(@"Video error: Empty SPS or PPS NALU");
            return;
        }

        // VTD doesn't want you to include the start code header (4 bytes long), so the pointers
        // skip it and the sizes subtract it. The parameter sets are read straight from `frame`,
        // so there is nothing to allocate or free.
        const uint8_t *parameterSetPointers[2] = {&frame[4], &frame[_spsSize + 4]};
        const size_t parameterSetSizes[2] = {(size_t)(_spsSize - 4), (size_t)(_ppsSize - 4)};

        // drop the previous description before creating the new one
        if (_formatDesc)
        {
            CFRelease(_formatDesc);
            _formatDesc = NULL;
        }
        status = CMVideoFormatDescriptionCreateFromH264ParameterSets(kCFAllocatorDefault, 2,
                                                                     parameterSetPointers,
                                                                     parameterSetSizes, 4,
                                                                     &_formatDesc);
        NSLog(@"\t\t Creation of CMVideoFormatDescription: %@", (status == noErr) ? @"successful!" : @"failed...");
        if (status != noErr)
        {
            NSLog(@"\t\t Format Description ERROR type: %d", (int)status);
            // without a format description nothing that follows can be decoded
            return;
        }

        // See if decomp session can convert from previous format description
        // to the new one, if not we need to remake the decomp session.
        // This snippet was not necessary for my applications but it could be for yours
        /*BOOL needNewDecompSession = (VTDecompressionSessionCanAcceptFormatDescription(_decompressionSession, _formatDesc) == NO);
         if(needNewDecompSession)
         {
             [self createDecompSession];
         }*/

        if (pictureFollows)
        {
            // now lets handle the IDR frame that (should) come after the parameter sets
            // I say "should" because that's how I expect my H264 stream to work, YMMV
            nalu_type = (frame[thirdStartCodeIndex + 4] & 0x1F);
            NSLog(@"~~~~~~~ Received NALU Type \"%@\" ~~~~~~~~", naluTypesStrings[nalu_type]);
        }
        else
        {
            // parameter sets only: nothing to decode in this frame
            nalu_type = -1;
        }
    }

    // create our VTDecompressionSession. This isnt neccessary if you choose to use AVSampleBufferDisplayLayer
    if (_decompressionSession == NULL)
    {
        [self createDecompSession];
    }

    // type 5 is an IDR frame NALU, type 1 is a non-IDR (P frame) NALU; both are packaged the same way
    if (nalu_type == 5 || nalu_type == 1)
    {
        // the picture NALU starts right after the SPS and PPS NALUs of this frame,
        // or at the beginning of the frame when there are none
        uint32_t offset = thirdStartCodeIndex;
        blockLength = (long)frameSize - (long)offset;
        if (blockLength <= 4)
        {
            NSLog(@"Video error: Picture NALU is empty");
            return;
        }

        data = malloc((size_t)blockLength);
        if (data == NULL)
        {
            NSLog(@"Video error: Could not allocate %ld bytes for the picture NALU", blockLength);
            return;
        }
        memcpy(data, &frame[offset], (size_t)blockLength);

        // replace the start code header on this NALU with its size.
        // AVCC format requires that you do this.
        // htonl converts the unsigned int from host to network byte order
        uint32_t dataLength32 = htonl((uint32_t)(blockLength - 4));
        memcpy(data, &dataLength32, sizeof(uint32_t));

        // Create a block buffer around the copy. kCFAllocatorMalloc makes the block buffer free()
        // the memory when it is finalised, so the bytes stay valid for as long as the
        // (asynchronous) decoder holds on to the buffer.
        status = CMBlockBufferCreateWithMemoryBlock(NULL, data,  // memoryBlock to hold buffered data
                                                    blockLength, // block length of the mem block in bytes.
                                                    kCFAllocatorMalloc, NULL,
                                                    0,           // offsetToData
                                                    blockLength, // dataLength of relevant bytes, starting at offsetToData
                                                    0, &blockBuffer);
        NSLog(@"\t\t BlockBufferCreation: \t %@", (status == kCMBlockBufferNoErr) ? @"successful!" : @"failed...");
        if (status == kCMBlockBufferNoErr)
        {
            // ownership of the memory moved to the block buffer
            data = NULL;
        }
    }

    // now create our sample buffer from the block buffer,
    if (status == noErr && blockBuffer != NULL)
    {
        // here I'm not bothering with any timing specifics since in my case we displayed all frames immediately
        const size_t sampleSize = (size_t)blockLength;
        status = CMSampleBufferCreate(kCFAllocatorDefault,
                                      blockBuffer, true, NULL, NULL,
                                      _formatDesc, 1, 0, NULL, 1,
                                      &sampleSize, &sampleBuffer);
        NSLog(@"\t\t SampleBufferCreate: \t %@", (status == noErr) ? @"successful!" : @"failed...");
    }

    if (status == noErr && sampleBuffer != NULL)
    {
        // set some values of the sample buffer's attachments
        CFArrayRef attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, YES);
        if (attachments != NULL && CFArrayGetCount(attachments) > 0)
        {
            CFMutableDictionaryRef dict = (CFMutableDictionaryRef)CFArrayGetValueAtIndex(attachments, 0);
            CFDictionarySetValue(dict, kCMSampleAttachmentKey_DisplayImmediately, kCFBooleanTrue);
        }
        // either send the samplebuffer to a VTDecompressionSession or to an AVSampleBufferDisplayLayer;
        // -render: releases the sample buffer
        [self render:sampleBuffer];
    }

    // the sample buffer keeps its own reference to the block buffer, so ours can go
    if (blockBuffer != NULL)
    {
        CFRelease(blockBuffer);
    }
    // only non-NULL when the block buffer could not be created and never took ownership
    if (data != NULL)
    {
        free(data);
        data = NULL;
    }
}
The following method creates your VTD session. Recreate it whenever you receive new parameters. (You don't have to recreate it every time you receive parameters, pretty sure.)
If you want to set attributes for the destination CVPixelBuffer, read up on CoreVideo PixelBufferAttributes values and put them in NSDictionary *destinationImageBufferAttributes.
// (Re)creates the VTDecompressionSession for the current _formatDesc.
// Any existing session is invalidated and released first. On failure _decompressionSession
// is left NULL and the error code is logged.
-(void) createDecompSession
{
    // make sure to destroy the old VTD session; just overwriting the pointer would leak it
    if (_decompressionSession != NULL)
    {
        VTDecompressionSessionInvalidate(_decompressionSession);
        CFRelease(_decompressionSession);
        _decompressionSession = NULL;
    }

    // a session cannot be created before the SPS/PPS parameters have been received
    if (_formatDesc == NULL)
    {
        NSLog(@"Video Decompression Session Create: \t %@", @"failed...");
        NSLog(@"\t\t VTD ERROR: format description is null");
        return;
    }

    VTDecompressionOutputCallbackRecord callBackRecord;
    callBackRecord.decompressionOutputCallback = decompressionSessionDecodeFrameCallback;
    // this is necessary if you need to make calls to Objective C "self" from within in the callback method.
    callBackRecord.decompressionOutputRefCon = (__bridge void *)self;

    // you can set some desired attributes for the destination pixel buffer. I didn't use this but you may.
    // If you need them, build the dictionary and pass it instead of the second NULL below, e.g.
    //   NSDictionary *destinationImageBufferAttributes =
    //       @{ (id)kCVPixelBufferOpenGLESCompatibilityKey : @YES };
    //   ... (__bridge CFDictionaryRef)destinationImageBufferAttributes ...
    OSStatus status = VTDecompressionSessionCreate(NULL, _formatDesc, NULL,
                                                   NULL, // destination image buffer attributes
                                                   &callBackRecord, &_decompressionSession);
    NSLog(@"Video Decompression Session Create: \t %@", (status == noErr) ? @"successful!" : @"failed...");
    if(status != noErr) NSLog(@"\t\t VTD ERROR type: %d", (int)status);
}
Now this method gets called every time VTD is done decompressing any frame you sent to it. This method gets called even if there's an error or if the frame is dropped.
// VTDecompressionSession output callback.
//
// decompressionOutputRefCon : the object that created the session (set in -createDecompSession).
// sourceFrameRefCon         : the per-frame value passed to VTDecompressionSessionDecodeFrame;
//                             -render: passes a retained NSDate here.
// status                    : noErr when the frame was decoded.
// imageBuffer               : the decoded frame; it can be NULL, e.g. when the frame was dropped.
void decompressionSessionDecodeFrameCallback(void *decompressionOutputRefCon,
                                             void *sourceFrameRefCon,
                                             OSStatus status,
                                             VTDecodeInfoFlags infoFlags,
                                             CVImageBufferRef imageBuffer,
                                             CMTime presentationTimeStamp,
                                             CMTime presentationDuration)
{
    // -render: hands the timestamp over with CFBridgingRetain; balance that retain here,
    // otherwise one NSDate is leaked per decoded frame
    if (sourceFrameRefCon != NULL)
    {
        CFRelease(sourceFrameRefCon);
    }

    THISCLASSNAME *streamManager = (__bridge THISCLASSNAME *)decompressionOutputRefCon;
    if (status != noErr)
    {
        NSError *error = [NSError errorWithDomain:NSOSStatusErrorDomain code:status userInfo:nil];
        NSLog(@"Decompressed error: %@", error);
    }
    else if (imageBuffer == NULL)
    {
        // no error but also no picture (dropped frame): nothing to display
        NSLog(@"Decompressed without an image buffer (info flags: %u)", (unsigned int)infoFlags);
    }
    else
    {
        NSLog(@"Decompressed sucessfully");
        // do something with your resulting CVImageBufferRef that is your decompressed frame
        [streamManager displayDecodedFrame:imageBuffer];
    }
}
This is where we actually send the sampleBuffer off to the VTD to be decoded.
// Sends one sample buffer to the VTDecompressionSession for asynchronous decoding.
// Takes over the caller's reference: the sample buffer is released before returning,
// whether or not the frame could be submitted.
- (void) render:(CMSampleBufferRef)sampleBuffer
{
    if (sampleBuffer == NULL)
    {
        return;
    }

    // without a session there is nothing to decode with; still honour the release contract
    if (_decompressionSession == NULL)
    {
        NSLog(@"\t\t VTD decode skipped: no decompression session");
        CFRelease(sampleBuffer);
        return;
    }

    VTDecodeFrameFlags flags = kVTDecodeFrame_EnableAsynchronousDecompression;
    VTDecodeInfoFlags flagOut = 0;
    // the submission time travels with the frame as its sourceFrameRefCon
    NSDate* currentTime = [NSDate date];
    OSStatus status = VTDecompressionSessionDecodeFrame(_decompressionSession, sampleBuffer, flags,
                                                        (void*)CFBridgingRetain(currentTime), &flagOut);
    if (status != noErr)
    {
        NSLog(@"\t\t VTD decode frame ERROR type: %d", (int)status);
    }
    CFRelease(sampleBuffer);
    // if you're using AVSampleBufferDisplayLayer, you only need to use this line of code
    // [videoLayer enqueueSampleBuffer:sampleBuffer];
}
If you're using AVSampleBufferDisplayLayer, be sure to init the layer like this, in viewDidLoad or inside some other init method.
// Creates the AVSampleBufferDisplayLayer, attaches a host-clock timebase to it and adds it to
// the view's layer tree. Only needed when frames are displayed through the layer.
-(void) viewDidLoad
{
    [super viewDidLoad];

    // create our AVSampleBufferDisplayLayer and add it to the view
    self.videoLayer = [[AVSampleBufferDisplayLayer alloc] init];
    self.videoLayer.frame = self.view.frame;
    self.videoLayer.bounds = self.view.bounds;
    self.videoLayer.videoGravity = AVLayerVideoGravityResizeAspect;

    // set Timebase, you may need this if you need to display frames at specific times
    // I didn't need it so I haven't verified that the timebase is working
    CMTimebaseRef controlTimebase = NULL;
    OSStatus status = CMTimebaseCreateWithMasterClock(CFAllocatorGetDefault(), CMClockGetHostTimeClock(), &controlTimebase);
    if (status == noErr && controlTimebase != NULL)
    {
        // attach it first: setting time/rate on the layer's timebase only works once it has one
        self.videoLayer.controlTimebase = controlTimebase;
        CMTimebaseSetTime(controlTimebase, kCMTimeZero);
        CMTimebaseSetRate(controlTimebase, 1.0);
        // the layer retains the timebase; release the reference returned by the Create call
        CFRelease(controlTimebase);
    }

    [[self.view layer] addSublayer:self.videoLayer];
}
If you can't find the VTD error codes in the framework, I decided to just include them here. (Again, all these errors and more can be found inside the VideoToolbox.framework itself in the project navigator, in the file VTErrors.h.)
You will get one of these error codes either in the VTD decode frame callback or when you create your VTD session if you did something incorrectly.
kVTPropertyNotSupportedErr = -12900,
kVTPropertyReadOnlyErr = -12901,
kVTParameterErr = -12902,
kVTInvalidSessionErr = -12903,
kVTAllocationFailedErr = -12904,
kVTPixelTransferNotSupportedErr = -12905, // c.f. -8961
kVTCouldNotFindVideoDecoderErr = -12906,
kVTCouldNotCreateInstanceErr = -12907,
kVTCouldNotFindVideoEncoderErr = -12908,
kVTVideoDecoderBadDataErr = -12909, // c.f. -8969
kVTVideoDecoderUnsupportedDataFormatErr = -12910, // c.f. -8970
kVTVideoDecoderMalfunctionErr = -12911, // c.f. -8960
kVTVideoEncoderMalfunctionErr = -12912,
kVTVideoDecoderNotAvailableNowErr = -12913,
kVTImageRotationNotSupportedErr = -12914,
kVTVideoEncoderNotAvailableNowErr = -12915,
kVTFormatDescriptionChangeNotSupportedErr = -12916,
kVTInsufficientSourceColorDataErr = -12917,
kVTCouldNotCreateColorCorrectionDataErr = -12918,
kVTColorSyncTransformConvertFailedErr = -12919,
kVTVideoDecoderAuthorizationErr = -12210,
kVTVideoEncoderAuthorizationErr = -12211,
kVTColorCorrectionPixelTransferFailedErr = -12212,
kVTMultiPassStorageIdentifierMismatchErr = -12213,
kVTMultiPassStorageInvalidErr = -12214,
kVTFrameSiloInvalidTimeStampErr = -12215,
kVTFrameSiloInvalidTimeRangeErr = -12216,
kVTCouldNotFindTemporalFilterErr = -12217,
kVTPixelTransferNotPermittedErr = -12218,
A good Swift example of much of this can be found in Josh Baker's Avios library: https://github.com/tidwall/Avios
Note that Avios currently expects the user to handle chunking data at NAL start codes, but does handle decoding the data from that point forward.
Also worth a look is the Swift based RTMP library HaishinKit (formerly "LF"), which has its own decoding implementation, including more robust NALU parsing: https://github.com/shogo4405/lf.swift
In addition to VTErrors above, I thought it's worth adding CMFormatDescription, CMBlockBuffer, CMSampleBuffer errors that you may encounter while trying Livy's example.
kCMFormatDescriptionError_InvalidParameter = -12710,
kCMFormatDescriptionError_AllocationFailed = -12711,
kCMFormatDescriptionError_ValueNotAvailable = -12718,
kCMBlockBufferNoErr = 0,
kCMBlockBufferStructureAllocationFailedErr = -12700,
kCMBlockBufferBlockAllocationFailedErr = -12701,
kCMBlockBufferBadCustomBlockSourceErr = -12702,
kCMBlockBufferBadOffsetParameterErr = -12703,
kCMBlockBufferBadLengthParameterErr = -12704,
kCMBlockBufferBadPointerParameterErr = -12705,
kCMBlockBufferEmptyBBufErr = -12706,
kCMBlockBufferUnallocatedBlockErr = -12707,
kCMBlockBufferInsufficientSpaceErr = -12708,
kCMSampleBufferError_AllocationFailed = -12730,
kCMSampleBufferError_RequiredParameterMissing = -12731,
kCMSampleBufferError_AlreadyHasDataBuffer = -12732,
kCMSampleBufferError_BufferNotReady = -12733,
kCMSampleBufferError_SampleIndexOutOfRange = -12734,
kCMSampleBufferError_BufferHasNoSampleSizes = -12735,
kCMSampleBufferError_BufferHasNoSampleTimingInfo = -12736,
kCMSampleBufferError_ArrayTooSmall = -12737,
kCMSampleBufferError_InvalidEntryCount = -12738,
kCMSampleBufferError_CannotSubdivide = -12739,
kCMSampleBufferError_SampleTimingInfoInvalid = -12740,
kCMSampleBufferError_InvalidMediaTypeForOperation = -12741,
kCMSampleBufferError_InvalidSampleData = -12742,
kCMSampleBufferError_InvalidMediaFormat = -12743,
kCMSampleBufferError_Invalidated = -12744,
kCMSampleBufferError_DataFailed = -16750,
kCMSampleBufferError_DataCanceled = -16751,
Thanks to Olivia for this great and detailed post!
I recently started to program a streaming app on iPad Pro with Xamarin forms and this article helped a lot and I found many references to it throughout the web.
I suppose many people have already rewritten Olivia's example in Xamarin, and I don't claim to be the best programmer in the world. But as nobody has posted a C#/Xamarin version here yet and I would like to give something back to the community for the great post above, here is my C# / Xamarin version. Maybe it helps someone to speed up progress in her or his project.
I kept close to Olivia's example, I even kept most of her comments.
First, because I prefer dealing with enums rather than numbers, I declared this NALU enum.
For the sake of completeness I also added some "exotic" NALU types I found on the internet:
/// <summary>
/// NAL unit types as carried in the low five bits of the first byte after a start code.
/// The numeric values are the values found in the stream, so casting <c>(header &amp; 0x1F)</c>
/// to this enum always yields a named member (0-31).
/// </summary>
public enum NALUnitType : byte
{
    NALU_TYPE_UNKNOWN = 0,
    NALU_TYPE_SLICE = 1,        // coded slice of a non-IDR picture
    NALU_TYPE_DPA = 2,          // data partition A
    NALU_TYPE_DPB = 3,          // data partition B
    NALU_TYPE_DPC = 4,          // data partition C
    NALU_TYPE_IDR = 5,          // coded slice of an IDR picture
    NALU_TYPE_SEI = 6,          // supplemental enhancement information
    NALU_TYPE_SPS = 7,          // sequence parameter set
    NALU_TYPE_PPS = 8,          // picture parameter set
    NALU_TYPE_AUD = 9,          // access unit delimiter
    NALU_TYPE_EOSEQ = 10,       // end of sequence
    NALU_TYPE_EOSTREAM = 11,    // end of stream
    NALU_TYPE_FILL = 12,        // filler data
    NALU_TYPE_13 = 13,
    NALU_TYPE_14 = 14,
    NALU_TYPE_15 = 15,
    NALU_TYPE_16 = 16,
    NALU_TYPE_17 = 17,
    NALU_TYPE_18 = 18,
    NALU_TYPE_19 = 19,
    NALU_TYPE_20 = 20,
    NALU_TYPE_21 = 21,
    NALU_TYPE_22 = 22,
    NALU_TYPE_23 = 23,
    NALU_TYPE_STAP_A = 24,
    NALU_TYPE_STAP_B = 25,
    NALU_TYPE_MTAP16 = 26,
    NALU_TYPE_MTAP24 = 27,
    NALU_TYPE_FU_A = 28,
    NALU_TYPE_FU_B = 29,
    // 30 and 31 complete the 5-bit value range
    NALU_TYPE_30 = 30,
    NALU_TYPE_31 = 31,
}
More or less for convenience reasons I also defined an additional dictionary for the NALU descriptions:
/// <summary>
/// Display text for each named NAL unit type; used only for debug output.
/// </summary>
public static Dictionary<NALUnitType, string> GetDescription { get; } =
    new Dictionary<NALUnitType, string>
    {
        [NALUnitType.NALU_TYPE_UNKNOWN] = "Unspecified (non-VCL)",
        [NALUnitType.NALU_TYPE_SLICE] = "Coded slice of a non-IDR picture (VCL) [P-frame]",
        [NALUnitType.NALU_TYPE_DPA] = "Coded slice data partition A (VCL)",
        [NALUnitType.NALU_TYPE_DPB] = "Coded slice data partition B (VCL)",
        [NALUnitType.NALU_TYPE_DPC] = "Coded slice data partition C (VCL)",
        [NALUnitType.NALU_TYPE_IDR] = "Coded slice of an IDR picture (VCL) [I-frame]",
        [NALUnitType.NALU_TYPE_SEI] = "Supplemental Enhancement Information [SEI] (non-VCL)",
        [NALUnitType.NALU_TYPE_SPS] = "Sequence Parameter Set [SPS] (non-VCL)",
        [NALUnitType.NALU_TYPE_PPS] = "Picture Parameter Set [PPS] (non-VCL)",
        [NALUnitType.NALU_TYPE_AUD] = "Access Unit Delimiter [AUD] (non-VCL)",
        [NALUnitType.NALU_TYPE_EOSEQ] = "End of Sequence (non-VCL)",
        [NALUnitType.NALU_TYPE_EOSTREAM] = "End of Stream (non-VCL)",
        [NALUnitType.NALU_TYPE_FILL] = "Filler data (non-VCL)",
        [NALUnitType.NALU_TYPE_13] = "Sequence Parameter Set Extension (non-VCL)",
        [NALUnitType.NALU_TYPE_14] = "Prefix NAL Unit (non-VCL)",
        [NALUnitType.NALU_TYPE_15] = "Subset Sequence Parameter Set (non-VCL)",
        [NALUnitType.NALU_TYPE_16] = "Reserved (non-VCL)",
        [NALUnitType.NALU_TYPE_17] = "Reserved (non-VCL)",
        [NALUnitType.NALU_TYPE_18] = "Reserved (non-VCL)",
        [NALUnitType.NALU_TYPE_19] = "Coded slice of an auxiliary coded picture without partitioning (non-VCL)",
        [NALUnitType.NALU_TYPE_20] = "Coded Slice Extension (non-VCL)",
        [NALUnitType.NALU_TYPE_21] = "Coded Slice Extension for Depth View Components (non-VCL)",
        [NALUnitType.NALU_TYPE_22] = "Reserved (non-VCL)",
        [NALUnitType.NALU_TYPE_23] = "Reserved (non-VCL)",
        [NALUnitType.NALU_TYPE_STAP_A] = "STAP-A Single-time Aggregation Packet (non-VCL)",
        [NALUnitType.NALU_TYPE_STAP_B] = "STAP-B Single-time Aggregation Packet (non-VCL)",
        [NALUnitType.NALU_TYPE_MTAP16] = "MTAP16 Multi-time Aggregation Packet (non-VCL)",
        [NALUnitType.NALU_TYPE_MTAP24] = "MTAP24 Multi-time Aggregation Packet (non-VCL)",
        [NALUnitType.NALU_TYPE_FU_A] = "FU-A Fragmentation Unit (non-VCL)",
        [NALUnitType.NALU_TYPE_FU_B] = "FU-B Fragmentation Unit (non-VCL)",
    };
Here comes my main decoding procedure. I assume the received frame is a raw byte array:
/// <summary>
/// Decodes one chunk of an H.264 Annex B stream.
/// The chunk must start with a 4-byte start code at index 0 and is expected to be either
/// [SPS][PPS][picture], [SPS][PPS] or a single picture NALU (IDR or non-IDR slice).
/// SPS/PPS (re)create the format description and the decompression session; the picture part
/// is converted to AVCC (length-prefixed NALUs) and passed to the decompression session.
/// </summary>
/// <param name="frame">Raw bytes of the chunk.</param>
public void Decode(byte[] frame)
{
    // length of NALU start code in bytes.
    // for h.264 the start code is 4 bytes and looks like this: 0 x 00 00 00 01
    const uint naluHeaderLength = 4;

    // the start code plus at least the one-byte NALU header must be present
    if (frame == null || frame.Length <= naluHeaderLength)
    {
        SendDebugMessage("Video Error: Frame is too short to contain a NALU.");
        return;
    }

    uint frameSize = (uint)frame.Length;
    SendDebugMessage($"Received frame of {frameSize} bytes.");

    // I know how my H.264 data source's NALUs looks like so I know start code index is always 0.
    uint firstStartCodeIndex = 0;
    uint secondStartCodeIndex = 0;
    // stays 0 unless SPS/PPS precede the picture in this frame, so it doubles as the picture offset
    uint thirdStartCodeIndex = 0;

    // check the first 8bits after the NALU start code, mask out bits 0-2, the NALU type ID is in bits 3-7
    uint startNaluIndex = firstStartCodeIndex + naluHeaderLength;
    byte startByte = frame[startNaluIndex];
    NALUnitType naluType = (NALUnitType)(startByte & 0x1F); // 0001 1111
    SendDebugMessage($"1st Start Code Index: {firstStartCodeIndex}");
    SendDebugMessage($"1st NALU Type: '{DescribeNaluType(naluType)}' ({(int)naluType})");

    // bits 1 and 2 are the NRI; shift them down so the logged value is 0-3
    int nalRefIdc = (startByte & 0x60) >> 5; // 0110 0000
    SendDebugMessage($"1st NRI (NAL Ref Idc): {nalRefIdc}");

    // IF the very first NALU type is an IDR -> handle it like a slice frame (-> re-cast it to type 1 [Slice])
    if (naluType == NALUnitType.NALU_TYPE_IDR)
    {
        naluType = NALUnitType.NALU_TYPE_SLICE;
    }

    // if we haven't already set up our format description with our SPS PPS parameters,
    // we can't process any frames except type 7 that has our parameters
    if (naluType != NALUnitType.NALU_TYPE_SPS && this.FormatDescription == null)
    {
        SendDebugMessage("Video Error: Frame is not an I-Frame and format description is null.");
        return;
    }

    bool spsInThisFrame = false;

    // NALU type 7 is the SPS parameter NALU
    if (naluType == NALUnitType.NALU_TYPE_SPS)
    {
        // the next 4-byte start code marks the end of the SPS; the search never leaves the frame
        bool found = TryFindStartCode(frame, firstStartCodeIndex + naluHeaderLength, naluHeaderLength, out secondStartCodeIndex);
        if (!found || secondStartCodeIndex + naluHeaderLength >= frameSize)
        {
            SendDebugMessage("Video Error: SPS NALU is not followed by another NALU.");
            return;
        }

        spsInThisFrame = true;
        this.SpsSize = secondStartCodeIndex; // includes the header in the size
        SendDebugMessage($"2nd Start Code Index: {secondStartCodeIndex} -> SPS Size: {this.SpsSize}");

        // find what the second NALU type is
        startByte = frame[secondStartCodeIndex + naluHeaderLength];
        naluType = (NALUnitType)(startByte & 0x1F);
        SendDebugMessage($"2nd NALU Type: '{DescribeNaluType(naluType)}' ({(int)naluType})");

        nalRefIdc = (startByte & 0x60) >> 5; // 0110 0000
        SendDebugMessage($"2nd NRI (NAL Ref Idc): {nalRefIdc}");
    }

    // type 8 is the PPS parameter NALU
    if (naluType == NALUnitType.NALU_TYPE_PPS)
    {
        // SpsSize only describes the frame in which the SPS was found
        if (!spsInThisFrame)
        {
            SendDebugMessage("Video Error: PPS NALU received without a preceding SPS NALU.");
            return;
        }

        // the PPS ends at the next start code, or at the end of the frame if nothing follows it
        bool nextFound = TryFindStartCode(frame, secondStartCodeIndex + naluHeaderLength, naluHeaderLength, out thirdStartCodeIndex);
        if (!nextFound)
        {
            thirdStartCodeIndex = frameSize;
        }
        bool pictureFollows = nextFound && thirdStartCodeIndex + naluHeaderLength < frameSize;

        uint spsSize = secondStartCodeIndex;
        uint ppsSize = thirdStartCodeIndex - secondStartCodeIndex;
        this.PpsSize = ppsSize;
        SendDebugMessage($"3rd Start Code Index: {thirdStartCodeIndex} -> PPS Size: {this.PpsSize}");

        // guards the unsigned subtractions below against wrapping around
        if (spsSize <= naluHeaderLength || ppsSize <= naluHeaderLength)
        {
            SendDebugMessage("Video Error: Empty SPS or PPS NALU.");
            return;
        }

        // allocate enough data to fit the SPS and PPS parameters into our data objects.
        // VTD doesn't want you to include the start code header (4 bytes long) so we subtract 4 here
        byte[] sps = new byte[spsSize - naluHeaderLength];
        byte[] pps = new byte[ppsSize - naluHeaderLength];

        // copy in the actual sps and pps values, again ignoring the 4 byte header
        Array.Copy(frame, naluHeaderLength, sps, 0, sps.Length);
        Array.Copy(frame, spsSize + naluHeaderLength, pps, 0, pps.Length);

        // create video format description
        List<byte[]> parameterSets = new List<byte[]> { sps, pps };
        this.FormatDescription = CMVideoFormatDescription.FromH264ParameterSets(parameterSets, (int)naluHeaderLength, out CMFormatDescriptionError formatDescriptionError);
        bool formatCreated = formatDescriptionError == CMFormatDescriptionError.None && this.FormatDescription != null;
        SendDebugMessage($"Creation of CMVideoFormatDescription: {(formatCreated ? $"Successful! (Video Codec = {this.FormatDescription.VideoCodecType}, Dimension = {this.FormatDescription.Dimensions.Height} x {this.FormatDescription.Dimensions.Width}px, Type = {this.FormatDescription.MediaType})" : $"Failed ({formatDescriptionError})")}");
        if (!formatCreated)
        {
            // nothing that follows can be decoded without a format description
            return;
        }

        // re-create the decompression session whenever new PPS data was received
        // NOTE(review): the previous session is not disposed here — confirm whether it should be
        // drained and disposed before it is replaced.
        this.DecompressionSession = this.CreateDecompressionSession(this.FormatDescription);

        if (!pictureFollows)
        {
            // parameter sets only: nothing to decode in this frame
            return;
        }

        // now lets handle the IDR frame that (should) come after the parameter sets
        // I say "should" because that's how I expect my H264 stream to work, YMMV
        startByte = frame[thirdStartCodeIndex + naluHeaderLength];
        naluType = (NALUnitType)(startByte & 0x1F);
        SendDebugMessage($"3rd NALU Type: '{DescribeNaluType(naluType)}' ({(int)naluType})");

        nalRefIdc = (startByte & 0x60) >> 5; // 0110 0000
        SendDebugMessage($"3rd NRI (NAL Ref Idc): {nalRefIdc}");
    }

    // type 5 is an IDR frame NALU, type 1 a non-IDR slice; everything else is ignored.
    if (naluType != NALUnitType.NALU_TYPE_IDR && naluType != NALUnitType.NALU_TYPE_SLICE)
    {
        return;
    }

    if (this.DecompressionSession == null)
    {
        SendDebugMessage("Video Error: No decompression session available.");
        return;
    }

    // the picture data begins after the SPS and PPS NALUs of this frame,
    // or at the start of the frame when there are none
    uint offset = thirdStartCodeIndex;
    uint blockLength = frameSize - offset;
    SendDebugMessage($"Block Length (NALU type '{naluType}'): {blockLength}");
    var blockData = new byte[blockLength];
    Array.Copy(frame, offset, blockData, 0, blockLength);

    // This next block is very specific to my application and wasn't in Olivia's example:
    // For my stream is encoded by NVIDEA NVEC I had to deal with additional 3-byte start codes within my IDR/SLICE frame.
    // Every NALU found is written as [4-byte big endian length][NALU data], which is what AVCC requires.
    // ======================================================================================================================================================
    // find all 3 byte start code indices (0x00 00 01) within the block data
    // (a 4-byte start code is matched by its last three bytes)
    const uint startCodeLength = 3;
    List<uint> foundStartCodeIndices = new List<uint>();
    uint searchFrom = 0;
    while (TryFindStartCode(blockData, searchFrom, startCodeLength, out uint codeIndex))
    {
        foundStartCodeIndices.Add(codeIndex);
        uint payloadIndex = codeIndex + startCodeLength;
        // a start code at the very end of the block has no NALU header byte to inspect
        if (payloadIndex < blockData.Length)
        {
            var tmpNaluType = (NALUnitType)(blockData[payloadIndex] & 0x1F);
            SendDebugMessage($"3-Byte Start Code (0x000001) found at index: {codeIndex} (NALU type {(int)tmpNaluType} '{DescribeNaluType(tmpNaluType)}'");
        }
        searchFrom = payloadIndex;
    }

    // determine where each slice starts and how long it is
    uint totalLength = 0;
    List<uint> sliceStarts = new List<uint>();
    List<uint> slicePayloadLengths = new List<uint>();
    for (int i = 0; i < foundStartCodeIndices.Count; i++)
    {
        bool isLastValue = (i == foundStartCodeIndices.Count - 1);
        // first byte right after the start code
        uint startIndex = foundStartCodeIndices[i] + startCodeLength;
        // one past the last byte: beginning of next start code or end of the block
        uint endIndex = isLastValue ? (uint)blockData.Length : foundStartCodeIndices[i + 1];
        if (endIndex <= startIndex)
        {
            // start code without any data behind it
            continue;
        }

        uint payloadLength = endIndex - startIndex;
        sliceStarts.Add(startIndex);
        slicePayloadLengths.Add(payloadLength);
        // every slice is prefixed by its 4-byte length
        totalLength += payloadLength + naluHeaderLength;
    }

    if (totalLength == 0)
    {
        SendDebugMessage("Video Error: No NALU data found in frame.");
        return;
    }

    // Arrange slices like this:
    // [4byte slice1 size][slice1 data][4byte slice2 size][slice2 data]...[4byte slice4 size][slice4 data]
    // https://stackoverflow.com/questions/65576349/nvidia-nvenc-media-foundation-encoded-h-264-frames-not-decoded-properly-using
    byte[] finalBuffer = new byte[totalLength];
    uint destinationIndex = 0;
    for (int i = 0; i < sliceStarts.Count; i++)
    {
        uint payloadLength = slicePayloadLengths[i];

        // length prefix, written byte by byte as big endian so it does not depend on host byte order
        finalBuffer[destinationIndex] = (byte)(payloadLength >> 24);
        finalBuffer[destinationIndex + 1] = (byte)(payloadLength >> 16);
        finalBuffer[destinationIndex + 2] = (byte)(payloadLength >> 8);
        finalBuffer[destinationIndex + 3] = (byte)payloadLength;

        // slice data goes right behind its length prefix
        Array.Copy(blockData, sliceStarts[i], finalBuffer, destinationIndex + naluHeaderLength, payloadLength);
        destinationIndex += naluHeaderLength + payloadLength;
    }
    // ======================================================================================================================================================

    // from here we are back on track with Olivia's code:
    // now create block buffer from final byte[] buffer
    CMBlockBufferFlags flags = CMBlockBufferFlags.AssureMemoryNow | CMBlockBufferFlags.AlwaysCopyData;
    // the native buffers are disposed when leaving this method, on every path
    using (var finalBlockBuffer = CMBlockBuffer.FromMemoryBlock(finalBuffer, 0, flags, out CMBlockBufferError blockBufferError))
    {
        SendDebugMessage($"Creation of Final Block Buffer: {(blockBufferError == CMBlockBufferError.None ? "Successful!" : $"Failed ({blockBufferError})")}");
        if (blockBufferError != CMBlockBufferError.None || finalBlockBuffer == null) return;

        // now create the sample buffer
        nuint[] sampleSizeArray = new nuint[] { totalLength };
        using (CMSampleBuffer sampleBuffer = CMSampleBuffer.CreateReady(finalBlockBuffer, this.FormatDescription, 1, null, sampleSizeArray, out CMSampleBufferError sampleBufferError))
        {
            SendDebugMessage($"Creation of Final Sample Buffer: {(sampleBufferError == CMSampleBufferError.None ? "Successful!" : $"Failed ({sampleBufferError})")}");
            if (sampleBufferError != CMSampleBufferError.None || sampleBuffer == null) return;

            // set sample attachments
            CMSampleBufferAttachmentSettings[] attachments = sampleBuffer.GetSampleAttachments(true);
            if (attachments != null && attachments.Length > 0)
            {
                attachments[0].DisplayImmediately = true;
            }

            // enable async decoding
            VTDecodeFrameFlags decodeFrameFlags = VTDecodeFrameFlags.EnableAsynchronousDecompression;

            // add time stamp
            var currentTime = DateTime.Now;
            var currentTimePtr = new IntPtr(currentTime.Ticks);

            // send the sample buffer to a VTDecompressionSession
            var result = DecompressionSession.DecodeFrame(sampleBuffer, decodeFrameFlags, currentTimePtr, out VTDecodeInfoFlags decodeInfoFlags);
            if (result == VTStatus.Ok)
            {
                SendDebugMessage($"Executing DecodeFrame(..): Successful! (Info: {decodeInfoFlags})");
            }
            else
            {
                NSError error = new NSError(CFErrorDomain.OSStatus, (int)result);
                SendDebugMessage($"Executing DecodeFrame(..): Failed ({(VtStatusEx)result} [0x{(int)result:X8}] - {error}) - Info: {decodeInfoFlags}");
            }
        }
    }
}

/// <summary>
/// Returns the description of a NALU type, or a generic text when the type has no entry,
/// so that debug output never throws on an unexpected type value.
/// </summary>
private static string DescribeNaluType(NALUnitType naluType)
{
    return NALUnit.GetDescription.TryGetValue(naluType, out string description)
        ? description
        : "Unknown NALU type";
}

/// <summary>
/// Searches <paramref name="buffer"/> for a start code made of (codeLength - 1) zero bytes
/// followed by 0x01, beginning at index <paramref name="from"/>. Only matches that lie
/// completely inside the buffer are reported.
/// </summary>
/// <returns>true and the index of the first match, or false (index 0) when there is none.</returns>
private static bool TryFindStartCode(byte[] buffer, uint from, uint codeLength, out uint index)
{
    for (long i = from; i + codeLength <= buffer.Length; i++)
    {
        bool match = buffer[i + codeLength - 1] == 0x01;
        for (long zero = 0; match && zero < codeLength - 1; zero++)
        {
            match = buffer[i + zero] == 0x00;
        }

        if (match)
        {
            index = (uint)i;
            return true;
        }
    }

    index = 0;
    return false;
}
My function to create the decompression session looks like this:
/// <summary>
/// Creates a VideoToolbox decompression session for the given format description,
/// requesting hardware-accelerated decoding and routing decoded frames to
/// <c>DecompressionSessionDecodeFrameCallback</c>.
/// </summary>
/// <param name="formatDescription">Format description of the video stream to decode.</param>
/// <returns>The created session, or <c>null</c> if creation failed.</returns>
private VTDecompressionSession CreateDecompressionSession(CMVideoFormatDescription formatDescription)
{
    VTDecompressionSession.VTDecompressionOutputCallback callBackRecord = this.DecompressionSessionDecodeFrameCallback;
    VTVideoDecoderSpecification decoderSpecification = new VTVideoDecoderSpecification
    {
        EnableHardwareAcceleratedVideoDecoder = true
    };
    // Empty attribute set: no explicit constraints are placed on the output pixel buffers.
    CVPixelBufferAttributes destinationImageBufferAttributes = new CVPixelBufferAttributes();
    try
    {
        var decompressionSession = VTDecompressionSession.Create(callBackRecord, formatDescription, decoderSpecification, destinationImageBufferAttributes);
        // Create may hand back null instead of throwing; do not report that as a success.
        if (decompressionSession == null)
        {
            SendDebugMessage("Video Decompression Session Creation: Failed (no session was returned)");
            return null;
        }
        SendDebugMessage("Video Decompression Session Creation: Successful!");
        return decompressionSession;
    }
    catch (Exception e)
    {
        SendDebugMessage($"Video Decompression Session Creation: Failed ({e.Message})");
        return null;
    }
}
The decompression session callback routine:
/// <summary>
/// Output callback of the decompression session. On success the decoded image buffer
/// is converted to a <c>UIImage</c> and published through <c>OnImageFrameReady</c>;
/// on failure the status is logged.
/// </summary>
/// <param name="sourceFrame">Opaque value that was passed to DecodeFrame for this frame (unused here).</param>
/// <param name="status">Result of decoding the frame.</param>
/// <param name="infoFlags">Additional decode information (unused here).</param>
/// <param name="imageBuffer">The decoded image.</param>
/// <param name="presentationTimeStamp">Presentation time stamp of the frame (unused here).</param>
/// <param name="presentationDuration">Presentation duration of the frame (unused here).</param>
private void DecompressionSessionDecodeFrameCallback(
    IntPtr sourceFrame,
    VTStatus status,
    VTDecodeInfoFlags infoFlags,
    CVImageBuffer imageBuffer,
    CMTime presentationTimeStamp,
    CMTime presentationDuration)
{
    if (status != VTStatus.Ok)
    {
        NSError error = new NSError(CFErrorDomain.OSStatus, (int)status);
        SendDebugMessage($"Decompression: Failed ({(VtStatusEx)status} [0x{(int)status:X8}] - {error})");
        return;
    }

    SendDebugMessage("Decompression: Successful!");
    try
    {
        var image = GetImageFromImageBuffer(imageBuffer);
        // GetImageFromImageBuffer returns null when the buffer is not a pixel buffer or
        // rendering fails. The stream factory below runs lazily, outside this try/catch,
        // so a null image must be rejected here rather than dereferenced later.
        if (image == null)
        {
            SendDebugMessage("Decompression: No image could be created from the decoded frame");
            return;
        }

        // In my application I do not use a display layer but send the decoded image directly by an event:
        ImageSource imgSource = ImageSource.FromStream(() => image.AsPNG().AsStream());
        OnImageFrameReady?.Invoke(imgSource);
    }
    catch (Exception e)
    {
        SendDebugMessage(e.ToString());
    }
}
I use this function to convert the CVImageBuffer to a UIImage. It also refers to one of Olivia's posts mentioned above (how to convert a CVImageBufferRef to UIImage):
/// <summary>
/// Renders a decoded image buffer into a <c>UIImage</c> via Core Image.
/// </summary>
/// <param name="imageBuffer">The decoded image buffer; must be a <c>CVPixelBuffer</c>.</param>
/// <returns>
/// The rendered image, or <c>null</c> if the buffer is not a pixel buffer or the
/// image could not be rendered.
/// </returns>
private UIImage GetImageFromImageBuffer(CVImageBuffer imageBuffer)
{
    if (!(imageBuffer is CVPixelBuffer pixelBuffer)) return null;

    // This runs once per decoded frame, so the native Core Image / Core Graphics
    // wrappers are released deterministically instead of waiting for the GC.
    using (var ciImage = CIImage.FromImageBuffer(pixelBuffer))
    using (var temporaryContext = new CIContext())
    {
        if (ciImage == null) return null;

        var rect = CGRect.FromLTRB(0, 0, pixelBuffer.Width, pixelBuffer.Height);
        using (CGImage cgImage = temporaryContext.CreateCGImage(ciImage, rect))
        {
            if (cgImage == null) return null;
            return UIImage.FromImage(cgImage);
        }
    }
}
Last but not least my tiny little function for debug output, feel free to pimp it as needed for your purpose ;-)
/// <summary>
/// Writes a message to the debug output, prefixed with the decoder's tag.
/// </summary>
/// <param name="msg">Text to append after the prefix.</param>
private void SendDebugMessage(string msg)
{
    const string prefix = "VideoDecoder (iOS) - ";
    Debug.WriteLine(prefix + msg);
}
Finally, let's have a look at the namespaces used for the code above:
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Net;
using AvcLibrary;
using CoreFoundation;
using CoreGraphics;
using CoreImage;
using CoreMedia;
using CoreVideo;
using Foundation;
using UIKit;
using VideoToolbox;
using Xamarin.Forms;
@Livy: to avoid leaking the previous format description, add the following before the call to CMVideoFormatDescriptionCreateFromH264ParameterSets:
// Drop our reference to the existing format description (if there is one)
// and clear the ivar before a new description is written into it.
if (_formatDesc != NULL) {
    CFRelease(_formatDesc);
    _formatDesc = NULL;
}
This post helped me a lot with sending H264 video from one device to another, but switching between devices caused the function receivedRawVideoFrame to not work correctly due to some changes in the frame data.
Here is my final function. It decodes the NAL units directly from the data and does not rely on the order in which they appear within the data frame:
/**
 Parses one chunk of raw H.264 data delimited by 4-byte start codes (00 00 00 01)
 and feeds the contained picture NAL units to the decoder.

 The data is scanned once and, per NAL unit type, the start and end offsets of the
 LAST unit of that type are recorded. If both an SPS and a PPS are present, a new
 format description is created from them and the current session is destroyed so
 that it can be re-created. Every recorded NAL unit with type <= 5 is then wrapped
 in a CMSampleBuffer (start code replaced by a big-endian length prefix) and handed
 to -decompressFrame:.

 Limitations carried over from the original implementation: only 4-byte start codes
 are recognised, and when several NAL units of the same type occur in one chunk only
 the last one is processed.

 @param frameData Raw H.264 data containing one or more start-code-prefixed NAL units.
 */
- (void)receivedRawVideoFrame:(NSData*)frameData {
    // Length of the 00 00 00 01 start code, which is also the length of the
    // size prefix that replaces it.
    const NSUInteger startCodeLength = sizeof(uint32_t);

    NSUInteger frameSize = [frameData length];
    const uint8_t *frame = [frameData bytes];

    // A NAL unit needs a full start code plus at least one header byte. Bailing out
    // here also prevents the unsigned underflow of "frameSize - 4" on short input.
    if ( frame == NULL || frameSize <= startCodeLength ) {
        return;
    }

    NSMutableDictionary* nalUnitsStart = [NSMutableDictionary dictionary];
    NSMutableDictionary* nalUnitsEnd = [NSMutableDictionary dictionary];
    uint8_t previousNalUnitType = 0;

    for ( NSUInteger offset = 0; offset + startCodeLength < frameSize; offset++ ) {
        // Find the start of a NAL unit
        if (frame[offset] == 0x00 && frame[offset+1] == 0x00 && frame[offset+2] == 0x00 && frame[offset+3] == 0x01) {
            // The NAL unit type is stored in the low 5 bits of the first byte after the start code
            uint8_t nalType = frame[offset + startCodeLength] & 0x1F;
            // Record the end of the previous NAL unit
            nalUnitsEnd[@(previousNalUnitType)] = @(offset);
            previousNalUnitType = nalType;
            nalUnitsStart[@(nalType)] = @(offset + startCodeLength);
        }
    }
    // Record the end of the last NAL unit
    nalUnitsEnd[@(previousNalUnitType)] = @(frameSize);

    // Let's check if our data contains SPS && PPS NAL units
    NSNumber* spsOffset = nalUnitsStart[@(NAL_TYPE_SPS)];
    NSNumber* ppsOffset = nalUnitsStart[@(NAL_TYPE_PPS)];
    if ( spsOffset && ppsOffset ) {
        NSNumber* spsEnd = nalUnitsEnd[@(NAL_TYPE_SPS)];
        NSNumber* ppsEnd = nalUnitsEnd[@(NAL_TYPE_PPS)];
        NSAssert(spsEnd && ppsEnd, @" [DECODE]: Missing the end of NAL unit(s)");

        // The recorded start offsets already point past the start code, so the
        // parameter sets can be referenced in place inside the frame data.
        const uint8_t *parameterSetPointers[2] = {
            frame + spsOffset.unsignedIntegerValue,
            frame + ppsOffset.unsignedIntegerValue
        };
        size_t parameterSetSizes[2] = {
            spsEnd.unsignedIntegerValue - spsOffset.unsignedIntegerValue,
            ppsEnd.unsignedIntegerValue - ppsOffset.unsignedIntegerValue
        };

        // Create into a local first: the previous description is only released once a
        // replacement exists, so it is neither leaked nor lost when creation fails.
        CMVideoFormatDescriptionRef newFormatDesc = NULL;
        OSStatus status = CMVideoFormatDescriptionCreateFromH264ParameterSets(kCFAllocatorDefault,
                                                                              2,
                                                                              parameterSetPointers,
                                                                              parameterSetSizes,
                                                                              4,
                                                                              &newFormatDesc);
        DebugAssert(status == noErr, @" [DECODE]: Failed to create CMVideoFormatDescription for H264");
        if ( status != noErr ) {
            NSLog(@" [DECODE]: Failed to create CMVideoFormatDescription for H264");
            if ( newFormatDesc ) {
                CFRelease(newFormatDesc);
            }
        } else {
            if ( _formatDesc ) {
                CFRelease(_formatDesc);
            }
            _formatDesc = newFormatDesc;
            // Good place to re-create our decompression session
            [self destroySession];
        }
    }

    // Loop over all NAL units we have while ignoring everything with type > 5
    for ( NSNumber* nalType in nalUnitsStart.allKeys ) {
        if ( nalType.intValue > 5 ) {
            continue;
        }

        // Get the header too (0x00000001), that will be replaced with the NAL unit size
        NSNumber* nalStart = nalUnitsStart[nalType];
        NSNumber* nalEnd = nalUnitsEnd[nalType];
        NSUInteger blockStart = nalStart.unsignedIntegerValue - startCodeLength;
        size_t blockLength = nalEnd.unsignedIntegerValue - blockStart;

        uint8_t *data = malloc(blockLength);
        if ( data == NULL ) {
            NSLog(@" [DECODE]: Failed to allocate %zu bytes for NAL unit %@", blockLength, nalType);
            continue;
        }
        memcpy(data, frame + blockStart, blockLength);

        // Replace the start code header on this NALU with its size.
        // AVCC format requires that you do this.
        // htonl converts the unsigned int from host to network byte order
        uint32_t dataLength32 = htonl((uint32_t)(blockLength - startCodeLength));
        memcpy(data, &dataLength32, sizeof(dataLength32));

        // kCFAllocatorNull: the block buffer does not take ownership of `data`;
        // it is freed at the end of this iteration.
        // NOTE(review): if -decompressFrame: decodes asynchronously and keeps the
        // sample buffer alive beyond this iteration, `data` would be freed while
        // still referenced - confirm against -decompressFrame:.
        CMBlockBufferRef blockBuffer = NULL;
        OSStatus status = CMBlockBufferCreateWithMemoryBlock(NULL,
                                                             data,
                                                             blockLength,
                                                             kCFAllocatorNull,
                                                             NULL,
                                                             0,
                                                             blockLength,
                                                             0,
                                                             &blockBuffer);
        DebugAssert(status == noErr, @" [DECODE]: Failed to create CMBlockBufferRef for %@", nalType);
        if ( status != noErr ) {
            NSLog(@" [DECODE]: Failed to create CMBlockBufferRef for H264 for %@", nalType);
        } else {
            const size_t sampleSize = blockLength;
            /* NOTE:
             On success the sample buffer is not released here; per the original
             author's note it is released by the decompress frame function after
             it has been decoded.
             */
            CMSampleBufferRef sampleBuffer = NULL;
            status = CMSampleBufferCreate(kCFAllocatorDefault,
                                          blockBuffer,
                                          true,
                                          NULL,
                                          NULL,
                                          _formatDesc,
                                          1,
                                          0,
                                          NULL,
                                          1,
                                          &sampleSize,
                                          &sampleBuffer);
            DebugAssert(status == noErr, @" [DECODE]: Failed to create CMSampleBufferRef for %@", nalType);
            if ( status != noErr ) {
                NSLog(@" [DECODE]: Failed to create CMSampleBufferRef for H264 for %@", nalType);
                if ( sampleBuffer ) {
                    CFRelease(sampleBuffer);
                    sampleBuffer = NULL;
                }
            } else {
                // Set some values of the sample buffer's attachments
                CFArrayRef attachments = CMSampleBufferGetSampleAttachmentsArray(sampleBuffer, YES);
                CFMutableDictionaryRef dict = (CFMutableDictionaryRef)CFArrayGetValueAtIndex(attachments, 0);
                CFDictionarySetValue(dict, kCMSampleAttachmentKey_DisplayImmediately, kCFBooleanTrue);
                [self decompressFrame:sampleBuffer];
            }
        }

        if ( blockBuffer ) {
            CFRelease(blockBuffer);
            blockBuffer = NULL;
        }
        free(data);
        data = NULL;
    }
}
The decompressFrame function is responsible for creating a new decompression session whenever one is needed, based on the latest CMVideoFormatDescriptionRef we got from our stream.