I am trying to store and retrieve a TensorflowMicro buffer array as an Arduino MBED SDRAM pointer - tensorflow

The Arduino core is on github as ArduinoCore-mbed
I have had some success storing a camera buffer into Arduino MBED SDRAM, but that was fairly easy as the code was expecting a pointer when working with the camera. See code below:
SDRAMClass mySDRAM;
uint8_t *sdram_frame_buffer;
//uint8_t frame_buffer[320*320];
void setup() {
mySDRAM.begin();
sdram_frame_buffer = (uint8_t *)mySDRAM.malloc(320 * 320 * sizeof(uint8_t));
then in the main loop I did
if (cam.grab(sdram_frame_buffer) == 0){...
Note: I also do some frame alignment on the above code, but it still kind of works fine.
But now I want to store the entire TensorflowMicro c++ array
const unsigned char model_tflite[] = { 0x74, 0x69, 0x74, ...};
unsigned int model_tflite_len = 2640;
//which is later called as an array
model = tflite::GetModel(model_tflite); // name from the tflite converter model.h file
The problem here is that the Portenta has a max array size <= 1MB which is much less than the SDRAM max array size 8MB. What would be best would be to put the data for the Machine Learning model directly into SDRAM without using an array at all. Not sure if there is an easy way to do that.
Anyone have any suggestions?
Here is an example that runs on the Arduino Portenta
// Needed for SDRAM assignment on Arduino Portenta
#include <SDRAM.h>
#define ALIGN_PTR(p,a) ((p & (a-1)) ?(((uintptr_t)p + a) & ~(uintptr_t)(a-1)) : p)
SDRAMClass mySDRAM;
// assign values to a regular array
unsigned char model_tflite[] = {0x54, 0x46, 0x4c, 0x33};
unsigned int model_tflite_len = 4;
// define SDRAM pointer
unsigned char *sdram_mem;
unsigned char *sdram_tflite; // 32-byte aligned
void setup() {
Serial.begin(115200);
mySDRAM.begin(SDRAM_START_ADDRESS);
// setup SDRAM memory block
sdram_mem = (unsigned char *) SDRAM.malloc(4 + 32 /*alignment*/);
sdram_tflite = (unsigned char *)ALIGN_PTR((uintptr_t)sdram_mem, 32);
// THE PROBLEM
// How to assign the data directly to the allocated pointer memory
// without having to make an array as well
// The following traditional line works
// sdram_tflite = model_tflite;
// This line doesn't work
// sdram_tflite = {0x54, 0x46, 0x4c, 0x33};
// The following works, but is very clumsy
*(sdram_tflite + 0) = 0x54;
*(sdram_tflite + 1) = 0x46;
*(sdram_tflite + 2) = 0x4c;
*(sdram_tflite + 3) = 0x33;
}
void myShowArray( unsigned char b[], int sizeOfArray ) {
for ( int k = 0 ; k < sizeOfArray ; k++ ){
Serial.println(b[k], HEX);
}
Serial.println();
}
void myShowPointer( unsigned char *b, int sizeOfArray ) {
for ( int k = 0 ; k < sizeOfArray ; k++ ){
Serial.println(*(b + k), HEX);
}
Serial.println();
}
void loop() {
Serial.println("Regular array");
for (int i=0; i < model_tflite_len; i++){
Serial.println(model_tflite[i], HEX);
}
Serial.println();
Serial.println("SDRAM pointer as an array");
for (int i=0; i < model_tflite_len; i++){
Serial.println( *(sdram_tflite + i), HEX );
}
Serial.println();
Serial.println("Regular array passed as an array to the receiving function");
myShowArray(model_tflite, model_tflite_len);
Serial.println("Pointer passed as a pointer to the receiving function");
myShowPointer(sdram_tflite, model_tflite_len);
Serial.println("Pointer passed as an array to the receiving function");
myShowArray(*&sdram_tflite, model_tflite_len);
Serial.println("--------------------");
Serial.println();
delay(4000);
}
with output :
Regular array
54
46
4C
33
SDRAM pointer as an array
54
46
4C
33
Regular array passed as an array to the receiving function
54
46
4C
33
Pointer passed as a pointer to the receiving function
54
46
4C
33
Pointer passed as an array to the receiving function
54
46
4C
33
--------------------

// Needed for SDRAM assignment on Arduino Portenta
#include <SDRAM.h>
#define ALIGN_PTR(p,a) ((p & (a-1)) ?(((uintptr_t)p + a) & ~(uintptr_t)(a-1)) : p)
SDRAMClass mySDRAM;
// assign values to a regular array
unsigned char model_tflite[] = {0x54, 0x46, 0x4c, 0x33};
unsigned int model_tflite_len = 4;
// define SDRAM pointer
unsigned char *sdram_mem;
unsigned char *sdram_tflite; // 32-byte aligned
void setup() {
Serial.begin(115200);
mySDRAM.begin(SDRAM_START_ADDRESS);
// setup SDRAM memory block
sdram_mem = (unsigned char *) SDRAM.malloc(4 + 32 /*alignment*/);
sdram_tflite = (unsigned char *)ALIGN_PTR((uintptr_t)sdram_mem, 32);
// THE PROBLEM
// How to assign the data directly to the allocated pointer memory
// without having to make an array as well
// The following traditional line works
// sdram_tflite = model_tflite;
// This line doesn't work
// sdram_tflite = {0x54, 0x46, 0x4c, 0x33};
// The following works, but is very clumsy
// *(sdram_tflite + 0) = 0x54;
// *(sdram_tflite + 1) = 0x46;
// *(sdram_tflite + 2) = 0x4c;
// *(sdram_tflite + 3) = 0x33;
// Try this
// unsigned char buffer[8];
// memcpy(buffer,"\x20\x38\x00\x00\x0E\x82\x00\x06",8);
// yes this works fine!
memcpy(sdram_tflite, "\x54\x46\x4c\x33", model_tflite_len);
}
void myShowArray( unsigned char b[], int sizeOfArray ) {
for ( int k = 0 ; k < sizeOfArray ; k++ ){
Serial.println(b[k], HEX);
}
Serial.println();
}
void myShowPointer( unsigned char *b, int sizeOfArray ) {
for ( int k = 0 ; k < sizeOfArray ; k++ ){
Serial.println(*(b + k), HEX);
}
Serial.println();
}
void loop() {
Serial.println("Regular array");
for (int i=0; i < model_tflite_len; i++){
Serial.println(model_tflite[i], HEX);
}
Serial.println();
Serial.println("SDRAM pointer as an array");
for (int i=0; i < model_tflite_len; i++){
Serial.println( *(sdram_tflite + i), HEX );
}
Serial.println();
Serial.println("Regular array passed as an array to the receiving function");
myShowArray(model_tflite, model_tflite_len);
Serial.println("Pointer passed as a pointer to the receiving function");
myShowPointer(sdram_tflite, model_tflite_len);
Serial.println("Pointer passed as an array to the receiving function");
// myShowArray(*&sdram_tflite, model_tflite_len);
myShowArray(sdram_tflite, model_tflite_len);
Serial.println("--------------------");
Serial.println();
delay(4000);
}

Related

Threads indexing out of bounds in CUDA kernel

I am running a CUDA kernel which seems to be indexing out of bounds and I can not figure out why. I get error 8 write-of-size in cuda-memcheck.
I have tried to change the number of blocks and the number of threads in each block as well as only running a fraction of all iterations needed. Here is some usefull information as well as a replicable example which gives the error:
blockSize: 128
numBlocks: 512
Nvidia GTX 970
#include <iostream>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <vector>
#include <iterator>
#include <cuda_profiler_api.h>
#include <algorithm>
#include <cmath>
#include <numeric>
#include <stdio.h>
#include <fstream>
__host__
int NchooseK(const int &N, const int &K)
{
int result = 1;
for (int i = 1; i <= K; i++)
{
result *= N - (K - i);
result /= i;
}
return result;
}
__host__
inline int get_flatten_size(const unsigned int N){
int sum = 0;
for(int i=1; i<=N ; i++){
sum +=i*NchooseK(N,i);
}
return sum;
}
__host__
std::vector<int> comb(const int &N, const int &K, const int &length)
//void comb(int N, int K, int length)
{
int k;
std::vector<int> vec(K);
std::vector<int> flatten_vec(0);
std::string bitmask(K, 1); // K leading 1's
bitmask.resize(N, 0); // N-K trailing 0's
for (int j = 0; j < length; j++) {
k = 0;
for (int i = 0; i < N; ++i) // [0..N-1] integers
{
if (bitmask[i]) {
//std::cout << i << " ";
vec[k] = i;
k++;
}
//std::cout << std::endl;
}
std::prev_permutation(bitmask.begin(), bitmask.end());
flatten_vec.insert(flatten_vec.end(), vec.begin(),vec.end());
}
return flatten_vec;
}
__host__
void get_matrix_indices(const unsigned int N, int *sub_col, int *sub_size, int *cumulative_size)
{
int size, itterator = 0;
cumulative_size[0] = 0;
std::vector<int> size_i_columns;
std::vector<int> all_columns(0);
for(int i=1; i<=N; i++){
size = NchooseK(N,i);
size_i_columns = comb(N,i,size);
for(int j=0; j<size; j++){
sub_size[itterator]=i;
cumulative_size[itterator+1]=cumulative_size[itterator]+i;
itterator++;
}
all_columns.insert(all_columns.end(),size_i_columns.begin(),size_i_columns.end());
}
//sub_col = &all_columns[0];
for(int i = 0; i < all_columns.size(); i++) sub_col[i] = all_columns[i];
}
__global__
void comb_ols(const unsigned int M, const unsigned int N, int* sub_col, int *sub_size, int* cumulative_size, const unsigned int numberOfCalculations, const unsigned int max_size){
int size;
int start_index;
int index = blockIdx.x*blockDim.x+threadIdx.x;
int stride = blockDim.x*gridDim.x;
double *sub_matrix = new double[M*(1+max_size)];
for(int i = index; i < numberOfCalculations; i+=stride){
size = sub_size[i];
start_index = cumulative_size[i];
for(int j = 0; j < size; j++){
for(int k = 0; k<M; k++){
sub_matrix[k] = 1;
}
}
}
delete [] sub_matrix;
}
And then we the main function:
int main()
{
int N = 17;
int M = 263;
const unsigned int regressors = N-1;
const unsigned int numberOfCalculations = (int) (exp2((double) regressors) - 1);
const unsigned int size_sub_col = get_flatten_size(regressors);
int blockSize =128;
int numBlocks = (numberOfCalculations + blockSize-1)/blockSize;
std::cout << "\nblockSize :" << blockSize;
std::cout << "\nnumBlocks :" << numBlocks;
std::cout << "\nblockSize*numBlocks :" << blockSize*numBlocks;
std::cout << "\nregressors :" << regressors;
std::cout << "\nNumberOfCalculations :" << numberOfCalculations;
std::cout << "\nsize_sub_col :" << size_sub_col << '\n' ;
int *sub_size, *cumulative_size, *sub_columns;
cudaMallocManaged(&sub_size, numberOfCalculations*sizeof(int));
cudaMallocManaged(&cumulative_size, (numberOfCalculations+1)*sizeof(int));
cudaMallocManaged(&sub_columns, size_sub_col*sizeof(int));
get_matrix_indices(regressors,sub_columns, sub_size, cumulative_size);
const unsigned int max_size = N*M;
cudaProfilerStart();
comb_ols<<<numBlocks, blockSize>>>(M,N,sub_columns, sub_size, cumulative_size, numberOfCalculations, max_size);
cudaProfilerStop();
cudaDeviceSynchronize();
cudaFree(sub_size);
cudaFree(cumulative_size);
cudaFree(sub_columns);
return 0;
}
I fail to see why the threads would try to access illegal memory space. The way I understood is that the matrix sub_matrix will be initilized on each thread once and then the parallel for loop happens. Thus should each thread have the necessary memory space. Am I allocating too much memory on the GPU? How is "new sub_matrix" handled here?
If I read your code correctly, each thread is attempting to allocate M * (1 + M*N) doubles, which is 263 * ( 1 + 263*17) = ‭1,176,136‬ doubles, or 8.97Mb of heap memory per thread. You launch 128 * 512 threads. That would mean you require 588Gb of heap space for the kernel to run successfully.
Clearly your GPU lacks that amount of memory and the out of bounds memory access comes from failures in the new call (which you can check for, BTW).
Might I suggest that something in the size calculations for the heap memory you require is wrong. Otherwise you have an extremely unrealistic problem for the GPU and will require some other approach.
Note that even if you manage to redesign things to limit the code to a feasible malloc heap memory size, you will still need, in all likelihood, to resize the malloc heap to a suitable size before running the kernel. The cudaDeviceSetLimit API can be used for this.

GNU Radio circular buffer manipulation

I encountered the following error
gr::log :WARN: tpb_thread_body - asynchronous message buffer overflowing, dropping message
Out of serendipity, I ran into this GNU Radio presentation on
Youtube.
The presenter mentioned an OOT block he called "buffer" that is capable of eliminating the "buffer overflowing" error. Apparently, this block plays with different sample rates and the so-called "circular buffers". I haven't worked with circular buffers myself. Any ideas on circular buffers or any hints on how to build this buffer block are welcome.
EDIT
Below is the flowgraph that generates the error. As it was suggested in the comments, the culprits could be the message processing blocks (red-circled) namely generateCADU (for generating standard CCSDS frames) and processCADU (for extracting CADUs from a data stream).
The implementation file of the generateCADU block is given below
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <gnuradio/io_signature.h>
#include "generateCADU_impl.h"
#include "fec/ReedSolomon/ReedSolomon.h"
#include "fec/Scrambler/Scrambler.h"
namespace gr {
namespace ccsds {
generateCADU::sptr
generateCADU::make(int frameLength,std::string sync, int scramble, int rs, int intDepth)
{
return gnuradio::get_initial_sptr
(new generateCADU_impl(frameLength, sync, scramble, rs, intDepth));
}
/*
* The private constructor
*/
generateCADU_impl::generateCADU_impl(int frameLength,std::string sync, int scramble, int rs, int intDepth)
: gr::sync_block("generateCADU",
gr::io_signature::make(1, 1, sizeof(unsigned char)),
gr::io_signature::make(0, 0, 0)),
d_frameLength(frameLength),d_scramble(scramble == 1),d_rs(rs >= 1), d_basis(rs >= 2), d_intDepth(intDepth)
{
set_output_multiple(d_frameLength);
//Registering output port
message_port_register_out(pmt::mp("out"));
d_sync = parse_string(sync);
}
/*
* Our virtual destructor.
*/
generateCADU_impl::~generateCADU_impl()
{
}
unsigned char
generateCADU_impl::parse_hex(char c)
{
if ('0' <= c && c <= '9') return c - '0';
if ('A' <= c && c <= 'F') return c - 'A' + 10;
if ('a' <= c && c <= 'f') return c - 'a' + 10;
std::abort();
}
std::vector<unsigned char>
generateCADU_impl::parse_string(const std::string & s)
{
if (s.size() % 2 != 0) std::abort();
std::vector<unsigned char> result(s.size() / 2);
for (std::size_t i = 0; i != s.size() / 2; ++i)
result[i] = 16 * parse_hex(s[2 * i]) + parse_hex(s[2 * i + 1]);
return result;
}
int
generateCADU_impl::work(int noutput_items,
gr_vector_const_void_star &input_items,
gr_vector_void_star &output_items)
{
const unsigned char *in = (const unsigned char *) input_items[0];
//Reed-Solomon and Scrambler objects
ReedSolomon RS(16,d_intDepth,d_basis);// False = conventional, True = dual-basis
Scrambler S;
//Buffers
unsigned char *frameBuffer1 = (unsigned char*)malloc(d_frameLength*sizeof(unsigned char));
std::vector<unsigned char> frameBuffer2;
//The work function engine
for(int i = 0; (i + d_frameLength) < noutput_items; i += d_frameLength)
{
//Copying data from input stream
memcpy(frameBuffer1,in + i + d_frameLength,d_frameLength);
//Copying frame into std::vector buffer
frameBuffer2.insert(frameBuffer2.begin(),frameBuffer1, frameBuffer1 + d_frameLength);
//Optional scrambling and Reed-Solomon
if (d_rs) RS.Encode_RS(frameBuffer2);
if (d_scramble) S.Scramble(frameBuffer2);
//Insert sync word
frameBuffer2.insert(frameBuffer2.begin(), d_sync.begin(), d_sync.end());
//Transmitting PDU
pmt::pmt_t pdu(pmt::cons(pmt::PMT_NIL,pmt::make_blob(frameBuffer2.data(),frameBuffer2.size())));
message_port_pub(pmt::mp("out"), pdu);
//Clear buffer
frameBuffer2.clear();
}
// Tell runtime system how many output items we produced.
return noutput_items;
}
} /* namespace ccsds */
} /* namespace gr */
And here is the processCADU block. This block uses tags generated by the synchronizeCADU (which is simply a wrapper for the correlate_access_tag block) to extract CADUs
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <gnuradio/io_signature.h>
#include "processCADU_impl.h"
#include "fec/ReedSolomon/ReedSolomon.h"
#include "fec/Scrambler/Scrambler.h"
namespace gr {
namespace ccsds {
processCADU::sptr
processCADU::make(int frameLength, int scramble, int rs, int intDepth, std::string tagName)
{
return gnuradio::get_initial_sptr
(new processCADU_impl(frameLength, scramble, rs, intDepth, tagName));
}
/*
* The private constructor
*/
processCADU_impl::processCADU_impl(int frameLength, int scramble, int rs, int intDepth, std::string tagName)
: gr::sync_block("processCADU",
gr::io_signature::make(1, 1, sizeof(unsigned char)),
gr::io_signature::make(0, 0, 0)),
d_frameLength(frameLength),d_scramble(scramble == 1),d_rs(rs >= 1), d_basis(rs >= 2), d_intDepth(intDepth)
{
//Multiple input
set_output_multiple(d_frameLength * 8);
//Registering output port
message_port_register_out(pmt::mp("out"));
if (d_rs) d_frameLength += 32 * d_intDepth;
//SEtting tag name
key = pmt::mp(tagName);
}
/*
* Our virtual destructor.
*/
processCADU_impl::~processCADU_impl()
{
delete d_pack;
}
int
processCADU_impl::work(int noutput_items,
gr_vector_const_void_star &input_items,
gr_vector_void_star &output_items)
{
const unsigned char *in = (const unsigned char *) input_items[0];
unsigned char *out = (unsigned char *) output_items[0];
void *msg_data = NULL;
unsigned char frame_data[d_frameLength];
unsigned char frame_len = 0;
std::vector<unsigned char> frameBuffer;
//Reed-Solomon and Scrambler objects
ReedSolomon RS(16,d_intDepth,d_basis);// False = conventional, True = dual-basis
std::vector<int> errors;//errors.push_back(0);
Scrambler S;
d_tags.clear();
d_pack = new blocks::kernel::pack_k_bits(8);
this->get_tags_in_window(d_tags, 0, 0, noutput_items,key);
for(d_tags_itr = d_tags.begin(); d_tags_itr != d_tags.end(); d_tags_itr++) {
// Check that we have enough data for a full frame
if ((d_tags_itr->offset - this->nitems_read(0)) > (noutput_items - (d_frameLength) * 8))
{
return (d_tags_itr->offset - this->nitems_read(0) - 1);
}
//Pack bits into bytes
d_pack->pack(frame_data, &in[d_tags_itr->offset - this->nitems_read(0)], d_frameLength);
//Copying frame into std::vector buffer
frameBuffer.insert(frameBuffer.begin(),frame_data, frame_data + d_frameLength);
//Optional scrambling and Reed-Solomon
if (d_scramble) S.Scramble(frameBuffer);
//if (d_rs) RS.Decode_RS(frameBuffer,errors);
//If there is Reed-Solomon decoding
if(d_rs)
{
RS.Decode_RS(frameBuffer,errors);
if (RS.Success(errors)) // Success
{
//std::cout << "Success" << std::endl;
pmt::pmt_t pdu(pmt::cons(pmt::PMT_NIL,pmt::make_blob(frameBuffer.data(),frameBuffer.size())));
message_port_pub(pmt::mp("out"), pdu);
/*for(int i=0; i < errors.size(); i++)
{
//std::cout << "Number of Errors : " << errors.at(i) << std::endl << std::endl;
}*/
}
else // Failure
{
std::cout << "RS failure" << std::endl;
}
}
else{
pmt::pmt_t pdu(pmt::cons(pmt::PMT_NIL,pmt::make_blob(frameBuffer.data(),frameBuffer.size())));
message_port_pub(pmt::mp("out"), pdu);
}
//Clear buffers
frameBuffer.clear();
errors.clear();
}
// Tell runtime system how many output items we produced.
return noutput_items;
}
} /* namespace ccsds */
} /* namespace gr */
Regards,
M
Thanks to #MarcusMüller suggestion, using the tagged_stream paradigma as opposed to PDUs solved the problem. I was able to transmit 47 terabytes of data without any problems. Below is the code for the newly implemented block.
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <gnuradio/io_signature.h>
#include "genCADU_impl.h"
namespace gr {
namespace ccsds {
genCADU::sptr
genCADU::make(int frameLength,std::string sync, int scramble, int rs, int intDepth, std::string len_tag_key)
{
return gnuradio::get_initial_sptr
(new genCADU_impl(frameLength, sync, scramble, rs, intDepth, len_tag_key));
}
/*
* The private constructor
*/
genCADU_impl::genCADU_impl(int frameLength,std::string sync, int scramble, int rs, int intDepth, std::string len_tag_key)
: gr::tagged_stream_block("genCADU",
gr::io_signature::make(1, 1, sizeof(unsigned char)),
gr::io_signature::make(1, 1, sizeof(unsigned char)),len_tag_key),
d_frameLength(frameLength),d_scramble(scramble == 1),d_rs(rs >= 1), d_basis(rs >= 2), d_intDepth(intDepth)
{
//Synchronization pattern
d_sync = parse_string(sync);
//Reed-Solomon and Scrambler objects
RS = new ReedSolomon(16,d_intDepth,d_basis);// False = conventional, True = dual-basis
S = new Scrambler();
}
/*
* Our virtual destructor.
*/
genCADU_impl::~genCADU_impl()
{
delete RS;
delete S;
}
int
genCADU_impl::calculate_output_stream_length(const gr_vector_int &ninput_items)
{
int noutput_items = (d_rs) ? d_frameLength + 32*d_intDepth + d_sync.size() : d_frameLength + d_sync.size();
return noutput_items ;
}
unsigned char
genCADU_impl::parse_hex(char c)
{
if ('0' <= c && c <= '9') return c - '0';
if ('A' <= c && c <= 'F') return c - 'A' + 10;
if ('a' <= c && c <= 'f') return c - 'a' + 10;
std::abort();
}
std::vector<unsigned char>
genCADU_impl::parse_string(const std::string & s)
{
if (s.size() % 2 != 0) std::abort();
std::vector<unsigned char> result(s.size() / 2);
for (std::size_t i = 0; i != s.size() / 2; ++i)
result[i] = 16 * parse_hex(s[2 * i]) + parse_hex(s[2 * i + 1]);
return result;
}
int
genCADU_impl::work (int noutput_items,
gr_vector_int &ninput_items,
gr_vector_const_void_star &input_items,
gr_vector_void_star &output_items)
{
const unsigned char *in = (const unsigned char *) input_items[0];
unsigned char *out = (unsigned char *) output_items[0];
int total_len;
//Copy pdu from circular buffer to local buffer
buffer.insert(buffer.end(), in, in + d_frameLength);
//Optional scrambling and Reed-Solomon. TO DO: Turbo and LDPC
if (d_rs) RS->Encode_RS(buffer);
if (d_scramble) S->Scramble(buffer);
//Insert sync word
buffer.insert(buffer.begin(), d_sync.begin(), d_sync.end());
//Copy from local buffer to circular buffer
std::copy(buffer.begin(),buffer.end(),out);
//Clear the local buffer
total_len = buffer.size();
buffer.clear();
// Tell runtime system how many output items we produced.
return total_len;
}
} /* namespace ccsds */
} /* namespace gr */
Regards,
M.

Constructing bitmask ? bitwise packet

I have been wanting to experiment with this project Axon with an iOS app connecting over a tcp connection. Towards the end of the doc the protocol is explained as so
The wire protocol is simple and very much zeromq-like, where is a BE 24 bit unsigned integer representing a maximum length of roughly ~16mb. The data byte is currently only used to store the codec, for example "json" is simply 1, in turn JSON messages received on the client end will then be automatically decoded for you by selecting this same codec.
With the diagram
octet: 0 1 2 3 <length>
+------+------+------+------+------------------...
| meta | <length> | data ...
+------+------+------+------+------------------...
I have had experience working with binary protocols creating a packet such as:
NSUInteger INT_32_LENGTH = sizeof(uint32_t);
uint32_t length = [data length]; // data is an NSData object
NSMutableData *packetData = [NSMutableData dataWithCapacity:length + (INT_32_LENGTH * 2)];
[packetData appendBytes:&requestType length:INT_32_LENGTH];
[packetData appendBytes:&length length:INT_32_LENGTH];
[packetData appendData:data];
So my question is how would you create the data packet for the Axon request, I would assume some bit shifting, which I am not too clued up on.
Allocate 1 array of char or unsigned char with size == packet_size;
Decalre constants:
const int metaFieldPos = 0;
const int sizeofMetaField = sizeof(char);
const int lengthPos = metaFieldPos + sizeofMetaField;
const int sizeofLengthField = sizeof(char) * 3;
const int dataPos = lengthPos + sizeofLengthField;
If you got the data and can recognize begining of the packet, you can use constants above to
navigate by pointers.
May be these functions will help you (They use Qt, but you can easily translate them to library, that you use)
quint32 Convert::uint32_to_uint24(const quint32 value){
return value & (quint32)(0x00FFFFFFu);
}
qint32 Convert::int32_to_uint24(const qint32 value){
return value & (qint32)(0x00FFFFFF);
}
quint32 Convert::bytes_to_uint24(const char* from){
quint32 result = 0;
quint8 shift = 0;
for (int i = 0; i < bytesIn24Bits; i++) {
result |= static_cast<quint32>(*reinterpret_cast<const quint8 *>(from + i)) << shift;
shift+=bitsInByte;
}
return result;
}
void Convert::uint32_to_uint24Bytes(const quint32 value, char* from){
quint8 shift = 0;
for (int i = 0; i < bytesIn24Bits; i++) {
const quint32 buf = (value >> shift) & 0xFFu;
*(from + i) = *reinterpret_cast<const char *>(&buf);
shift+=bitsInByte;
}
}
QByteArray Convert::uint32_to_uint24QByteArray (const quint32 value){
QByteArray bytes;
bytes.resize(sizeof(value));
*reinterpret_cast<quint32 *>(bytes.data()) = value;
bytes.chop(1);
return bytes;
}

Convert decimal to binary and return array

probably there is a smart way to do that , but anyway i get error on this :
-(int*)decimalBinary:(int)decimal
{
int i=0;
int *bin;
while (decimal!=0)
{
bin[i]=decimal%2;
decimal=decimal/2;
i++;
}
return bin;
}
on the modulo line . why ?
And whats the better way to get it to array ?
Declaring
int *bin;
sets aside space for a pointer but doesn't make it point to an object. It is crucial to initialize bin before using it.
To solve your problem you can declare an array bin[4] in caller function (int main) and then pass *bin to your calling function.
The following code is adapted from This answer on how to print an integer in binary format. Storing "binary digits" into an int array is added into the code below:
#include <stdio.h> /* printf */
#include <stdlib.h> /* strtol */
const char *byte_to_binary(long x);
int main(void)
{
long lVal;
int i, len, array[18];
char buf[18];
{ /* binary string to int */
char *tmp;
char *b = "11010111001010110";
lVal=strtol(b, &tmp, 2); //convert string in "base 2" format to long int
printf("%d\n", lVal);
}
{
printf("%s", byte_to_binary(lVal));
/* byte to binary string */
sprintf(buf,"%s", byte_to_binary(lVal));
}
len = strlen(buf);
for(i=0;i<len;i++)
{ //store binary digits into an array.
array[i] = (buf[i]-'0');
}
getchar();
return 0;
}
const char *byte_to_binary(long x)
{
static char b[17]; //16 bits plus '\0'
b[0] = '\0';
char *p = b;
int z;
for (z = 65536; z > 0; z >>= 1) //2^16
{
*p++ = (x & z) ? '1' : '0';
}
return b;
}

How to calculate CRC-16 from HEX values?

In my code i need to calculate CRC-16 16 bit values for the HEX values stored as NSdata, below is the code snippet to calculate CRC-16 in c.
void UpdateCRC(unsigned short int *CRC, unsigned char x)
{
// This function uses the initial CRC value passed in the first
// argument, then modifies it using the single character passed
// as the second argument, according to a CRC-16 polynomial
// Arguments:
// CRC -- pointer to starting CRC value
// x -- new character to be processed
// Returns:
// The function does not return any values, but updates the variable
// pointed to by CRC
static int const Poly = 0xA001;
int i;
bool flag;
*CRC ^= x;
for (i=0; i<8; i++)
// CRC-16 polynomial
{
flag = ((*CRC & 1) == 1);
*CRC = (unsigned short int)(*CRC >> 1);
if (flag)
*CRC ^= Poly;
}
return;
}
NSdata which holds the hex values like below
const char connectByteArray[] = {
0x21,0x01,0x90,0x80,0x5F
};
NSData* data = [NSData dataWithBytes: connectByteArray length:sizeof(connectByteArray)];
I solved using the following C program, I hope it may help someone ..cheers!!!
#include <string.h>
#include <stdio.h>
const int order = 16;
const unsigned long polynom = 0x8005;
const int direct = 1;
const unsigned long crcinit = 0;
const unsigned long crcxor = 0;
const int refin = 1;
const int refout = 1;
// 'order' [1..32] is the CRC polynom order, counted without the leading '1' bit
// 'polynom' is the CRC polynom without leading '1' bit
// 'direct' [0,1] specifies the kind of algorithm: 1=direct, no augmented zero bits
// 'crcinit' is the initial CRC value belonging to that algorithm
// 'crcxor' is the final XOR value
// 'refin' [0,1] specifies if a data byte is reflected before processing (UART) or not
// 'refout' [0,1] specifies if the CRC will be reflected before XOR
// Data character string
const unsigned char string[] = {0x05,0x0f,0x01,0x00,0x00,0x99};
// internal global values:
unsigned long crcmask;
unsigned long crchighbit;
unsigned long crcinit_direct;
unsigned long crcinit_nondirect;
unsigned long crctab[256];
// subroutines
unsigned long reflect (unsigned long crc, int bitnum) {
// reflects the lower 'bitnum' bits of 'crc'
unsigned long i, j=1, crcout=0;
for (i=(unsigned long)1<<(bitnum-1); i; i>>=1) {
if (crc & i) crcout|=j;
j<<= 1;
}
return (crcout);
}
void generate_crc_table() {
// make CRC lookup table used by table algorithms
int i, j;
unsigned long bit, crc;
for (i=0; i<256; i++) {
crc=(unsigned long)i;
if (refin) crc=reflect(crc, 8);
crc<<= order-8;
for (j=0; j<8; j++) {
bit = crc & crchighbit;
crc<<= 1;
if (bit) crc^= polynom;
}
if (refin) crc = reflect(crc, order);
crc&= crcmask;
crctab[i]= crc;
}
}
unsigned long crctablefast (unsigned char* p, unsigned long len) {
// fast lookup table algorithm without augmented zero bytes, e.g. used in pkzip.
// only usable with polynom orders of 8, 16, 24 or 32.
unsigned long crc = crcinit_direct;
if (refin) crc = reflect(crc, order);
if (!refin) while (len--) crc = (crc << 8) ^ crctab[ ((crc >> (order-8)) & 0xff) ^ *p++];
else while (len--) crc = (crc >> 8) ^ crctab[ (crc & 0xff) ^ *p++];
if (refout^refin) crc = reflect(crc, order);
crc^= crcxor;
crc&= crcmask;
return(crc);
}
unsigned long crctable (unsigned char* p, unsigned long len) {
// normal lookup table algorithm with augmented zero bytes.
// only usable with polynom orders of 8, 16, 24 or 32.
unsigned long crc = crcinit_nondirect;
if (refin) crc = reflect(crc, order);
if (!refin) while (len--) crc = ((crc << 8) | *p++) ^ crctab[ (crc >> (order-8)) & 0xff];
else while (len--) crc = ((crc >> 8) | (*p++ << (order-8))) ^ crctab[ crc & 0xff];
if (!refin) while (++len < order/8) crc = (crc << 8) ^ crctab[ (crc >> (order-8)) & 0xff];
else while (++len < order/8) crc = (crc >> 8) ^ crctab[crc & 0xff];
if (refout^refin) crc = reflect(crc, order);
crc^= crcxor;
crc&= crcmask;
return(crc);
}
unsigned long crcbitbybit(unsigned char* p, unsigned long len) {
// bit by bit algorithm with augmented zero bytes.
// does not use lookup table, suited for polynom orders between 1...32.
unsigned long i, j, c, bit;
unsigned long crc = crcinit_nondirect;
for (i=0; i<len; i++) {
c = (unsigned long)*p++;
if (refin) c = reflect(c, 8);
for (j=0x80; j; j>>=1) {
bit = crc & crchighbit;
crc<<= 1;
if (c & j) crc|= 1;
if (bit) crc^= polynom;
}
}
for (i=0; i<order; i++) {
bit = crc & crchighbit;
crc<<= 1;
if (bit) crc^= polynom;
}
if (refout) crc=reflect(crc, order);
crc^= crcxor;
crc&= crcmask;
return(crc);
}
unsigned long crcbitbybitfast(unsigned char* p, unsigned long len) {
// fast bit by bit algorithm without augmented zero bytes.
// does not use lookup table, suited for polynom orders between 1...32.
unsigned long i, j, c, bit;
unsigned long crc = crcinit_direct;
for (i=0; i<len; i++) {
c = (unsigned long)*p++;
if (refin) c = reflect(c, 8);
for (j=0x80; j; j>>=1) {
bit = crc & crchighbit;
crc<<= 1;
if (c & j) bit^= crchighbit;
if (bit) crc^= polynom;
}
}
if (refout) crc=reflect(crc, order);
crc^= crcxor;
crc&= crcmask;
return(crc);
}
int main() {
// test program for checking four different CRC computing types that are:
// crcbit(), crcbitfast(), crctable() and crctablefast(), see above.
// parameters are at the top of this program.
// Result will be printed on the console.
int i;
unsigned long bit, crc;
// at first, compute constant bit masks for whole CRC and CRC high bit
crcmask = ((((unsigned long)1<<(order-1))-1)<<1)|1;
crchighbit = (unsigned long)1<<(order-1);
// check parameters
if (order < 1 || order > 32) {
printf("ERROR, invalid order, it must be between 1..32.\n");
return(0);
}
if (polynom != (polynom & crcmask)) {
printf("ERROR, invalid polynom.\n");
return(0);
}
if (crcinit != (crcinit & crcmask)) {
printf("ERROR, invalid crcinit.\n");
return(0);
}
if (crcxor != (crcxor & crcmask)) {
printf("ERROR, invalid crcxor.\n");
return(0);
}
// generate lookup table
generate_crc_table();
// compute missing initial CRC value
if (!direct) {
crcinit_nondirect = crcinit;
crc = crcinit;
for (i=0; i<order; i++) {
bit = crc & crchighbit;
crc<<= 1;
if (bit) crc^= polynom;
}
crc&= crcmask;
crcinit_direct = crc;
}
else {
crcinit_direct = crcinit;
crc = crcinit;
for (i=0; i<order; i++) {
bit = crc & 1;
if (bit) crc^= polynom;
crc >>= 1;
if (bit) crc|= crchighbit;
}
crcinit_nondirect = crc;
}
// call CRC algorithms using the CRC parameters above and print result to the console
printf("\n");
printf("CRC tester v1.1 written on 13/01/2003 by Sven Reifegerste (zorc/reflex)\n");
printf("-----------------------------------------------------------------------\n");
printf("\n");
printf("Parameters:\n");
printf("\n");
printf(" polynom : 0x%x\n", polynom);
printf(" order : %d\n", order);
printf(" crcinit : 0x%x direct, 0x%x nondirect\n", crcinit_direct, crcinit_nondirect);
printf(" crcxor : 0x%x\n", crcxor);
printf(" refin : %d\n", refin);
printf(" refout : %d\n", refout);
printf("\n");
printf(" data string : '%s' (%d bytes)\n", string, strlen(string));
printf("\n");
printf("Results:\n");
printf("\n");
printf(" crc bit by bit : 0x%x\n", crcbitbybit((unsigned char *)string, 6));
printf(" crc bit by bit fast : 0x%x\n", crcbitbybitfast((unsigned char *)string, strlen(string)));
if (!(order&7)) printf(" crc table : 0x%x\n", crctable((unsigned char *)string, strlen(string)));
if (!(order&7)) printf(" crc table fast : 0x%x\n", crctablefast((unsigned char *)string, strlen(string)));
return(0);
}