Degraded performance of CGAL::intersection between a 32 bit and 64 bit application while multi-threading

Degraded performance of CGAL::intersection between a 32 bit and 64 bit application while multi-threading - cgal

We have recently changed our application from 32 bit to 64 bit and have noticed a degradation in performance when using CGAL::intersection in multiple worker threads. Each thread is using a distinct set of shapes, and so do not interfere with each other.
Here is a simple example I put together to illustrate the issue:
#include <iostream>
#include <vector>
#include <thread>
#include <CGAL/Boolean_set_operations_2.h>
#include <CGAL/Exact_predicates_exact_constructions_kernel.h>
#include <CGAL/Gps_circle_segment_traits_2.h>
typedef CGAL::Exact_predicates_exact_constructions_kernel Kernel;
typedef CGAL::Gps_circle_segment_traits_2<Kernel> Traits_2;
typedef Kernel::Point_2 Point_2;
typedef Traits_2::General_polygon_2 Polygon_2;
typedef Traits_2::General_polygon_with_holes_2 Polygon_with_holes_2;
typedef Traits_2::X_monotone_curve_2 X_monotone_curve_2;
const bool multiThreading = true;
const int threadCount = 7;
const int attempts = 2520;
const int overallAttempts = 10;
Polygon_2 construct_polygon(const Point_2& p1, const Point_2& p2, const Point_2& p3, const Point_2& p4, const Point_2& p5)
{
Polygon_2 pgn;
X_monotone_curve_2 s1(p1, p2); pgn.push_back(s1);
X_monotone_curve_2 s2(p2, p3); pgn.push_back(s2);
X_monotone_curve_2 s3(p3, p4); pgn.push_back(s3);
X_monotone_curve_2 s4(p4, p5); pgn.push_back(s4);
X_monotone_curve_2 s5(p5, p1); pgn.push_back(s5);
return pgn;
}
void do_intersection()
{
Polygon_2 outerPolygon = construct_polygon(Point_2(0, 0), Point_2(300, 0), Point_2(310, 150), Point_2(300, 300), Point_2(0, 300));
int numHoles = 100;
std::vector<Polygon_2> holes(numHoles);
for (int i = 0; i < numHoles; i++)
{
holes[i] = construct_polygon(Point_2(2 * i + 1, 2 * i + 1), Point_2(2 * i + 1, 2 * i + 1.5), Point_2(2 * i + 2, 2 * i + 2), Point_2(2 * i + 5, 2 * i + 2), Point_2(2 * i + 3, 2 * i + 1));
}
Polygon_with_holes_2 p(outerPolygon, holes.begin(), holes.end());
Polygon_2 poly1 = construct_polygon(Point_2(1, 0), Point_2(50, 1), Point_2(12, 13), Point_2(25, 50), Point_2(12, 100));
int intersectAttempts = multiThreading ? attempts / threadCount : attempts;
for (int i = 0; i < intersectAttempts; i++)
{
std::list<Polygon_with_holes_2> intersect;
CGAL::intersection(p, poly1, std::back_inserter(intersect));
}
}
int main(int argc, char* argv[])
{
long long averageTime = 0;
for (int x = 0; x < overallAttempts; x++)
{
auto startTime = std::chrono::high_resolution_clock::now();
if (multiThreading)
{
std::thread threads[threadCount];
for (int i = 0; i < threadCount; i++)
{
threads[i] = std::thread(do_intersection);
}
for (int i = 0; i < threadCount; i++)
{
threads[i].join();
}
}
else
{
do_intersection();
}
auto endTime = std::chrono::high_resolution_clock::now();
auto diffTime = endTime - startTime;
averageTime = averageTime + diffTime.count();
std::cout << "Total cost: " << diffTime.count() * 1e-9 << std::endl;
}
std::cout << std::endl;
std::cout << "Average cost: " << (averageTime * 1e-9)/overallAttempts << std::endl;
return 0;
}
When using a single thread, this example is roughly 15% faster in x64 than in x86. However when using 7 worker threads (my machine has 8 logical processors), this example is roughly 50% slower in x64 than x86.
We are unsure why this is considerably slower, but i think it may be related to using CGAL::intersection on a polygon with a lot of holes. Any helpful suggestions would be greatly appreciated.

Related

Comparing Execution time with Time Complexity in Merge & Quick Sort

I have implemented Merge & Quick Sort in the textbook what I've learned, and it says Time Complexities of each sorts are like this:
Merge Sort: O(n.log(n)) / Quick Sort: average O(n.log(n)) and O(n2) in the worst case (if key array is sorted).
So I executed the programs with Two types of Arrays: sorted and random, with different sizes.
Since I wanted to get the Average time, I have tried 10 times per each case.
Here is the code of Merge & Quick Sort:
#include <iostream>
#include <ctime>
#include <vector>
#include <algorithm>
using namespace std;
void Merge(vector<int>& s, int low, int mid, int high) {
int i = low;
int j = mid + 1;
int k = low;
vector<int> u(s);
while (i <= mid && j <= high) {
if (s.at(i) < s.at(j)) {
u.at(k) = s.at(i);
i++;
} else {
u.at(k) = s.at(j);
j++;
}
k++;
}
if (i > mid) {
for (int a = j; a < high + 1; a++) {
u.at(k) = s.at(a);
k++;
}
} else {
for (int a = i; a < mid + 1; a++) {
u.at(k) = s.at(a);
k++;
}
}
for (int a = low; a < high + 1; a++)
s.at(a) = u.at(a);
}
void MergeSort(vector<int>& s, int low, int high) {
int mid;
if (low < high) {
mid = (low + high) / 2;
MergeSort(s, low, mid);
MergeSort(s, mid + 1, high);
Merge(s, low, mid, high);
}
}
void swap(int& a, int& b) {
int tmp = a;
a = b;
b = tmp;
}
void Partition(vector<int>& s, int low, int high, int& pvpoint) {
int j;
int pvitem;
pvitem = s.at(low);
j = low;
for (int i = low + 1; i <= high; i++) {
if (s.at(i) < pvitem) {
j++;
swap(s.at(i), s.at(j));
}
pvpoint = j;
swap(s.at(low), s.at(pvpoint));
}
}
void QuickSort(vector<int>& s, int low, int high) {
int pvpoint;
if (high > low) {
Partition(s, low, high, pvpoint);
QuickSort(s, low, pvpoint - 1);
QuickSort(s, pvpoint + 1, high);
}
}
And each of these main() functions are printing the execution times in SORTED, and RANDOM key arrays.
you can see the result with adding one of these main functions in Visual Studio(C++):
//Sorted key array
int main() {
int s;
for (int i = 1; i < 21; i++) { //Size is from 300 to 6000
s = i * 300;
vector<int> Arr(s);
cout << "N : " << s << "\n";
//Assign Random numbers to each elements
Arr.front() = rand() % Arr.size();
for (int j = 1; j < Arr.size(); j++) { Arr.at(j) = ((737 * Arr.at(j - 1) + 149) % (Arr.size() * 5)); }
sort(Arr.begin(), Arr.end());
//QuickSort(Arr, 0, Arr.size() - 1); <- you can switch using this instead of MergeSort(...) below
for (int i = 0; i < 10; i++) { //print 10 times of execution time
clock_t start, end;
start = clock();
MergeSort(Arr, 0, Arr.size() - 1);
end = clock() - start;
printf("%12.3f ", (double)end * 1000.0 / CLOCKS_PER_SEC);
}
cout << endl;
}
return 0;
}
//Random key array
int main() {
int s;
for (int i = 1; i < 21; i++) {
s = i * 3000;
vector<int> Arr(s);
cout << "N : " << s << "\n";
for (int i = 0; i < 10; i++) {
//Assign Random numbers to each elements
Arr.front() = rand() % Arr.size();
for (int j = 1; j < Arr.size(); j++) { Arr.at(j) = ((737 * Arr.at(j - 1) + 149) % (Arr.size() * 5)); }
//QuickSort(Arr, 0, Arr.size() - 1); <- you can switch using this instead of MergeSort(...) below
clock_t start, end;
start = clock();
MergeSort(Arr, 0, Arr.size() - 1);
end = clock() - start;
printf("%12.3f ", (double)end * 1000.0 / CLOCKS_PER_SEC);
}
cout << endl;
}
return 0;
}
And the THING is, the result is not matching with their time complexity. for example, Merge sort in(RANDOM Array)
size N=3000 prints 20 ms, but size N=60000 prints 1400~1600 ms !! it supposed to print almost 400 ms because Time complexity (Not in worse case) in Quick Sort is O(n.log(n)), isn't it? I want to know what affects to this time and how could I see the printed time that I expected.

You posted the same code in this question: Calculate Execution Times in Sort algorithm and you did not take my answer into account.
Your MergeSort function has a flaw: you duplicate the whole array in merge causing a lot of overhead and quadratic time complexity. This innocent looking definition: vector<int> u(s); defines u as a vector initialized as a copy of s, the full array.
C++ is a very powerful language, often too powerful, littered with traps and pitfalls such as this. It is a very good thing you tried to verify that your program meets the expected performance from the known time complexity of the algorithm. Such a concern is alas too rare.

Here are some guidelines:
For getting execution time:
#include <time.h>
int main()
{
struct timeval stop, start;
int arr[10000];
gettimeofday(&start, NULL);
mergeSort(arr, 0, 9999);
gettimeofday(&stop, NULL);
printf("Time taken for Quick sort is: %ld microseconds\n",
(stop.tv_sec-start.tv_sec)*1000000+stop.tv_usec-start.tv_usec);
}

Threads indexing out of bounds in CUDA kernel

I am running a CUDA kernel which seems to be indexing out of bounds and I can not figure out why. I get error 8 write-of-size in cuda-memcheck.
I have tried to change the number of blocks and the number of threads in each block as well as only running a fraction of all iterations needed. Here is some usefull information as well as a replicable example which gives the error:
blockSize: 128
numBlocks: 512
Nvidia GTX 970
#include <iostream>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <vector>
#include <iterator>
#include <cuda_profiler_api.h>
#include <algorithm>
#include <cmath>
#include <numeric>
#include <stdio.h>
#include <fstream>
__host__
int NchooseK(const int &N, const int &K)
{
int result = 1;
for (int i = 1; i <= K; i++)
{
result *= N - (K - i);
result /= i;
}
return result;
}
__host__
inline int get_flatten_size(const unsigned int N){
int sum = 0;
for(int i=1; i<=N ; i++){
sum +=i*NchooseK(N,i);
}
return sum;
}
__host__
std::vector<int> comb(const int &N, const int &K, const int &length)
//void comb(int N, int K, int length)
{
int k;
std::vector<int> vec(K);
std::vector<int> flatten_vec(0);
std::string bitmask(K, 1); // K leading 1's
bitmask.resize(N, 0); // N-K trailing 0's
for (int j = 0; j < length; j++) {
k = 0;
for (int i = 0; i < N; ++i) // [0..N-1] integers
{
if (bitmask[i]) {
//std::cout << i << " ";
vec[k] = i;
k++;
}
//std::cout << std::endl;
}
std::prev_permutation(bitmask.begin(), bitmask.end());
flatten_vec.insert(flatten_vec.end(), vec.begin(),vec.end());
}
return flatten_vec;
}
__host__
void get_matrix_indices(const unsigned int N, int *sub_col, int *sub_size, int *cumulative_size)
{
int size, itterator = 0;
cumulative_size[0] = 0;
std::vector<int> size_i_columns;
std::vector<int> all_columns(0);
for(int i=1; i<=N; i++){
size = NchooseK(N,i);
size_i_columns = comb(N,i,size);
for(int j=0; j<size; j++){
sub_size[itterator]=i;
cumulative_size[itterator+1]=cumulative_size[itterator]+i;
itterator++;
}
all_columns.insert(all_columns.end(),size_i_columns.begin(),size_i_columns.end());
}
//sub_col = &all_columns[0];
for(int i = 0; i < all_columns.size(); i++) sub_col[i] = all_columns[i];
}
__global__
void comb_ols(const unsigned int M, const unsigned int N, int* sub_col, int *sub_size, int* cumulative_size, const unsigned int numberOfCalculations, const unsigned int max_size){
int size;
int start_index;
int index = blockIdx.x*blockDim.x+threadIdx.x;
int stride = blockDim.x*gridDim.x;
double *sub_matrix = new double[M*(1+max_size)];
for(int i = index; i < numberOfCalculations; i+=stride){
size = sub_size[i];
start_index = cumulative_size[i];
for(int j = 0; j < size; j++){
for(int k = 0; k<M; k++){
sub_matrix[k] = 1;
}
}
}
delete [] sub_matrix;
}
And then we the main function:
int main()
{
int N = 17;
int M = 263;
const unsigned int regressors = N-1;
const unsigned int numberOfCalculations = (int) (exp2((double) regressors) - 1);
const unsigned int size_sub_col = get_flatten_size(regressors);
int blockSize =128;
int numBlocks = (numberOfCalculations + blockSize-1)/blockSize;
std::cout << "\nblockSize :" << blockSize;
std::cout << "\nnumBlocks :" << numBlocks;
std::cout << "\nblockSize*numBlocks :" << blockSize*numBlocks;
std::cout << "\nregressors :" << regressors;
std::cout << "\nNumberOfCalculations :" << numberOfCalculations;
std::cout << "\nsize_sub_col :" << size_sub_col << '\n' ;
int *sub_size, *cumulative_size, *sub_columns;
cudaMallocManaged(&sub_size, numberOfCalculations*sizeof(int));
cudaMallocManaged(&cumulative_size, (numberOfCalculations+1)*sizeof(int));
cudaMallocManaged(&sub_columns, size_sub_col*sizeof(int));
get_matrix_indices(regressors,sub_columns, sub_size, cumulative_size);
const unsigned int max_size = N*M;
cudaProfilerStart();
comb_ols<<<numBlocks, blockSize>>>(M,N,sub_columns, sub_size, cumulative_size, numberOfCalculations, max_size);
cudaProfilerStop();
cudaDeviceSynchronize();
cudaFree(sub_size);
cudaFree(cumulative_size);
cudaFree(sub_columns);
return 0;
}
I fail to see why the threads would try to access illegal memory space. The way I understood is that the matrix sub_matrix will be initilized on each thread once and then the parallel for loop happens. Thus should each thread have the necessary memory space. Am I allocating too much memory on the GPU? How is "new sub_matrix" handled here?

If I read your code correctly, each thread is attempting to allocate M * (1 + M*N) doubles, which is 263 * ( 1 + 263*17) = ‭1,176,136‬ doubles, or 8.97Mb of heap memory per thread. You launch 128 * 512 threads. That would mean you require 588Gb of heap space for the kernel to run successfully.
Clearly your GPU lacks that amount of memory and the out of bounds memory access comes from failures in the new call (which you can check for, BTW).
Might I suggest that something in the size calculations for the heap memory you require is wrong. Otherwise you have an extremely unrealistic problem for the GPU and will require some other approach.
Note that even if you manage to redesign things to limit the code to a feasible malloc heap memory size, you will still need, in all likelihood, to resize the malloc heap to a suitable size before running the kernel. The cudaDeviceSetLimit API can be used for this.

GNU Radio circular buffer manipulation

I encountered the following error
gr::log :WARN: tpb_thread_body - asynchronous message buffer overflowing, dropping message
Out of serendipity, I ran into this GNU Radio presentation on
Youtube.
The presenter mentioned an OOT block he called "buffer" that is capable of eliminating the "buffer overflowing" error. Apparently, this block plays with different sample rates and the so-called "circular buffers". I haven't worked with circular buffers myself. Any ideas on circular buffers or any hints on how to build this buffer block are welcome.
EDIT
Below is the flowgraph that generates the error. As it was suggested in the comments, the culprits could be the message processing blocks (red-circled) namely generateCADU (for generating standard CCSDS frames) and processCADU (for extracting CADUs from a data stream).
The implementation file of the generateCADU block is given below
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <gnuradio/io_signature.h>
#include "generateCADU_impl.h"
#include "fec/ReedSolomon/ReedSolomon.h"
#include "fec/Scrambler/Scrambler.h"
namespace gr {
namespace ccsds {
generateCADU::sptr
generateCADU::make(int frameLength,std::string sync, int scramble, int rs, int intDepth)
{
return gnuradio::get_initial_sptr
(new generateCADU_impl(frameLength, sync, scramble, rs, intDepth));
}
/*
* The private constructor
*/
generateCADU_impl::generateCADU_impl(int frameLength,std::string sync, int scramble, int rs, int intDepth)
: gr::sync_block("generateCADU",
gr::io_signature::make(1, 1, sizeof(unsigned char)),
gr::io_signature::make(0, 0, 0)),
d_frameLength(frameLength),d_scramble(scramble == 1),d_rs(rs >= 1), d_basis(rs >= 2), d_intDepth(intDepth)
{
set_output_multiple(d_frameLength);
//Registering output port
message_port_register_out(pmt::mp("out"));
d_sync = parse_string(sync);
}
/*
* Our virtual destructor.
*/
generateCADU_impl::~generateCADU_impl()
{
}
unsigned char
generateCADU_impl::parse_hex(char c)
{
if ('0' <= c && c <= '9') return c - '0';
if ('A' <= c && c <= 'F') return c - 'A' + 10;
if ('a' <= c && c <= 'f') return c - 'a' + 10;
std::abort();
}
std::vector<unsigned char>
generateCADU_impl::parse_string(const std::string & s)
{
if (s.size() % 2 != 0) std::abort();
std::vector<unsigned char> result(s.size() / 2);
for (std::size_t i = 0; i != s.size() / 2; ++i)
result[i] = 16 * parse_hex(s[2 * i]) + parse_hex(s[2 * i + 1]);
return result;
}
int
generateCADU_impl::work(int noutput_items,
gr_vector_const_void_star &input_items,
gr_vector_void_star &output_items)
{
const unsigned char *in = (const unsigned char *) input_items[0];
//Reed-Solomon and Scrambler objects
ReedSolomon RS(16,d_intDepth,d_basis);// False = conventional, True = dual-basis
Scrambler S;
//Buffers
unsigned char *frameBuffer1 = (unsigned char*)malloc(d_frameLength*sizeof(unsigned char));
std::vector<unsigned char> frameBuffer2;
//The work function engine
for(int i = 0; (i + d_frameLength) < noutput_items; i += d_frameLength)
{
//Copying data from input stream
memcpy(frameBuffer1,in + i + d_frameLength,d_frameLength);
//Copying frame into std::vector buffer
frameBuffer2.insert(frameBuffer2.begin(),frameBuffer1, frameBuffer1 + d_frameLength);
//Optional scrambling and Reed-Solomon
if (d_rs) RS.Encode_RS(frameBuffer2);
if (d_scramble) S.Scramble(frameBuffer2);
//Insert sync word
frameBuffer2.insert(frameBuffer2.begin(), d_sync.begin(), d_sync.end());
//Transmitting PDU
pmt::pmt_t pdu(pmt::cons(pmt::PMT_NIL,pmt::make_blob(frameBuffer2.data(),frameBuffer2.size())));
message_port_pub(pmt::mp("out"), pdu);
//Clear buffer
frameBuffer2.clear();
}
// Tell runtime system how many output items we produced.
return noutput_items;
}
} /* namespace ccsds */
} /* namespace gr */
And here is the processCADU block. This block uses tags generated by the synchronizeCADU (which is simply a wrapper for the correlate_access_tag block) to extract CADUs
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <gnuradio/io_signature.h>
#include "processCADU_impl.h"
#include "fec/ReedSolomon/ReedSolomon.h"
#include "fec/Scrambler/Scrambler.h"
namespace gr {
namespace ccsds {
processCADU::sptr
processCADU::make(int frameLength, int scramble, int rs, int intDepth, std::string tagName)
{
return gnuradio::get_initial_sptr
(new processCADU_impl(frameLength, scramble, rs, intDepth, tagName));
}
/*
* The private constructor
*/
processCADU_impl::processCADU_impl(int frameLength, int scramble, int rs, int intDepth, std::string tagName)
: gr::sync_block("processCADU",
gr::io_signature::make(1, 1, sizeof(unsigned char)),
gr::io_signature::make(0, 0, 0)),
d_frameLength(frameLength),d_scramble(scramble == 1),d_rs(rs >= 1), d_basis(rs >= 2), d_intDepth(intDepth)
{
//Multiple input
set_output_multiple(d_frameLength * 8);
//Registering output port
message_port_register_out(pmt::mp("out"));
if (d_rs) d_frameLength += 32 * d_intDepth;
//SEtting tag name
key = pmt::mp(tagName);
}
/*
* Our virtual destructor.
*/
processCADU_impl::~processCADU_impl()
{
delete d_pack;
}
int
processCADU_impl::work(int noutput_items,
gr_vector_const_void_star &input_items,
gr_vector_void_star &output_items)
{
const unsigned char *in = (const unsigned char *) input_items[0];
unsigned char *out = (unsigned char *) output_items[0];
void *msg_data = NULL;
unsigned char frame_data[d_frameLength];
unsigned char frame_len = 0;
std::vector<unsigned char> frameBuffer;
//Reed-Solomon and Scrambler objects
ReedSolomon RS(16,d_intDepth,d_basis);// False = conventional, True = dual-basis
std::vector<int> errors;//errors.push_back(0);
Scrambler S;
d_tags.clear();
d_pack = new blocks::kernel::pack_k_bits(8);
this->get_tags_in_window(d_tags, 0, 0, noutput_items,key);
for(d_tags_itr = d_tags.begin(); d_tags_itr != d_tags.end(); d_tags_itr++) {
// Check that we have enough data for a full frame
if ((d_tags_itr->offset - this->nitems_read(0)) > (noutput_items - (d_frameLength) * 8))
{
return (d_tags_itr->offset - this->nitems_read(0) - 1);
}
//Pack bits into bytes
d_pack->pack(frame_data, &in[d_tags_itr->offset - this->nitems_read(0)], d_frameLength);
//Copying frame into std::vector buffer
frameBuffer.insert(frameBuffer.begin(),frame_data, frame_data + d_frameLength);
//Optional scrambling and Reed-Solomon
if (d_scramble) S.Scramble(frameBuffer);
//if (d_rs) RS.Decode_RS(frameBuffer,errors);
//If there is Reed-Solomon decoding
if(d_rs)
{
RS.Decode_RS(frameBuffer,errors);
if (RS.Success(errors)) // Success
{
//std::cout << "Success" << std::endl;
pmt::pmt_t pdu(pmt::cons(pmt::PMT_NIL,pmt::make_blob(frameBuffer.data(),frameBuffer.size())));
message_port_pub(pmt::mp("out"), pdu);
/*for(int i=0; i < errors.size(); i++)
{
//std::cout << "Number of Errors : " << errors.at(i) << std::endl << std::endl;
}*/
}
else // Failure
{
std::cout << "RS failure" << std::endl;
}
}
else{
pmt::pmt_t pdu(pmt::cons(pmt::PMT_NIL,pmt::make_blob(frameBuffer.data(),frameBuffer.size())));
message_port_pub(pmt::mp("out"), pdu);
}
//Clear buffers
frameBuffer.clear();
errors.clear();
}
// Tell runtime system how many output items we produced.
return noutput_items;
}
} /* namespace ccsds */
} /* namespace gr */
Regards,
M

Thanks to #MarcusMüller suggestion, using the tagged_stream paradigma as opposed to PDUs solved the problem. I was able to transmit 47 terabytes of data without any problems. Below is the code for the newly implemented block.
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <gnuradio/io_signature.h>
#include "genCADU_impl.h"
namespace gr {
namespace ccsds {
genCADU::sptr
genCADU::make(int frameLength,std::string sync, int scramble, int rs, int intDepth, std::string len_tag_key)
{
return gnuradio::get_initial_sptr
(new genCADU_impl(frameLength, sync, scramble, rs, intDepth, len_tag_key));
}
/*
* The private constructor
*/
genCADU_impl::genCADU_impl(int frameLength,std::string sync, int scramble, int rs, int intDepth, std::string len_tag_key)
: gr::tagged_stream_block("genCADU",
gr::io_signature::make(1, 1, sizeof(unsigned char)),
gr::io_signature::make(1, 1, sizeof(unsigned char)),len_tag_key),
d_frameLength(frameLength),d_scramble(scramble == 1),d_rs(rs >= 1), d_basis(rs >= 2), d_intDepth(intDepth)
{
//Synchronization pattern
d_sync = parse_string(sync);
//Reed-Solomon and Scrambler objects
RS = new ReedSolomon(16,d_intDepth,d_basis);// False = conventional, True = dual-basis
S = new Scrambler();
}
/*
* Our virtual destructor.
*/
genCADU_impl::~genCADU_impl()
{
delete RS;
delete S;
}
int
genCADU_impl::calculate_output_stream_length(const gr_vector_int &ninput_items)
{
int noutput_items = (d_rs) ? d_frameLength + 32*d_intDepth + d_sync.size() : d_frameLength + d_sync.size();
return noutput_items ;
}
unsigned char
genCADU_impl::parse_hex(char c)
{
if ('0' <= c && c <= '9') return c - '0';
if ('A' <= c && c <= 'F') return c - 'A' + 10;
if ('a' <= c && c <= 'f') return c - 'a' + 10;
std::abort();
}
std::vector<unsigned char>
genCADU_impl::parse_string(const std::string & s)
{
if (s.size() % 2 != 0) std::abort();
std::vector<unsigned char> result(s.size() / 2);
for (std::size_t i = 0; i != s.size() / 2; ++i)
result[i] = 16 * parse_hex(s[2 * i]) + parse_hex(s[2 * i + 1]);
return result;
}
int
genCADU_impl::work (int noutput_items,
gr_vector_int &ninput_items,
gr_vector_const_void_star &input_items,
gr_vector_void_star &output_items)
{
const unsigned char *in = (const unsigned char *) input_items[0];
unsigned char *out = (unsigned char *) output_items[0];
int total_len;
//Copy pdu from circular buffer to local buffer
buffer.insert(buffer.end(), in, in + d_frameLength);
//Optional scrambling and Reed-Solomon. TO DO: Turbo and LDPC
if (d_rs) RS->Encode_RS(buffer);
if (d_scramble) S->Scramble(buffer);
//Insert sync word
buffer.insert(buffer.begin(), d_sync.begin(), d_sync.end());
//Copy from local buffer to circular buffer
std::copy(buffer.begin(),buffer.end(),out);
//Clear the local buffer
total_len = buffer.size();
buffer.clear();
// Tell runtime system how many output items we produced.
return total_len;
}
} /* namespace ccsds */
} /* namespace gr */
Regards,
M.

compare images using systemC

I wrote in this forum asking for help to solve this problem that took ame a lot of my time,i write my first program using systemC, I will expain my aim as much as I can , I stored 2 matrix of pixel value of image in two different text files, I write a systemC code that load two matrix and apply somme of absolute difference, if number of different superior of a Threshold the code displays message (motion).
My code composed of two modules, the first module check if there a number stored in a text file, if yes this Module will automates the other module to load the two matrix and compare them, I really need this code for my project graduation any help or suggestion.
#include "systemC.h"
#include "string.h"
#include "stdio.h"
#include"stdlib.h"
#include <time.h>
#include <math.h> /* fabs */
#include <fstream>
#include <iostream>
#include <fstream>
using namespace std;
#define _CRT_SECURE_NO_WARNINGS
_CRT_SECURE_NO_WARNINGS
double elapsed;
int H = 0;
int D = 0;
int a, b;
int in = false;
int L = 0;
char *mode1 = "r";
char *mode2 = "w";
int i, j, k;
int rows1, cols1, rows2, cols2;
bool fileFound = false;
FILE *SwitchContext;
FILE *image1;
FILE *image2;
FILE *image3;
int sum = 0;
clock_t start = clock();
SC_MODULE(synchronization)
{
sc_in<bool>sig ;
SC_CTOR(synchronization)
{
SC_METHOD(synchroprocess)
}
void synchroprocess()
{
cout << "\n Running Automation";
SwitchContext = fopen("F:/SWITCH CONTEXT.txt", mode2);
fscanf(SwitchContext, "%d", &L);
while (L != 0)
{
cout << "waiting...";
}
sig == true;
}
};
SC_MODULE(imageProcess)
{
sc_in<bool>sig;
SC_CTOR(imageProcess)
{
SC_METHOD(MotionDetector)
sensitive(sig);
}
void MotionDetector()
{
image3 = fopen("F:/image3.txt", mode2);
do
{
char *mode1 = "r";
char *mode2 = "w";
image1 = fopen("F:/image1.txt", mode1);
if (!image1)
{
printf("File Not Found!!\n");
fileFound = true;
}
else
fileFound = false;
}
while (fileFound);
do
{
image2 = fopen("F:/image2.txt", mode1);
if (!image2)
{
printf("File Not Found!!\n");
fileFound = true;
}
else
fileFound = false;
}
while (fileFound);
rows1 = rows2 = 384;
cols1 = cols2 = 512;
int **mat1 = (int **)malloc(rows1 * sizeof(int*));
for (i = 0; i < rows1; i++)
mat1[i] = (int *)malloc(cols1 * sizeof(int));
i = 0;
int **mat2 = (int **)malloc(rows2 * sizeof(int*));
for (i = 0; i < rows2; i++)
mat2[i] = (int *)malloc(cols2 * sizeof(int));
i = 0;
while (!feof(image1))
{
for (i = 0; i < rows1; i++)
{
for (j = 0; j < cols1; j++)
fscanf(image1, "%d%", &mat1[i][j]);
}
}
i = 0;
j = 0;
while (!feof(image2))
{
for (i = 0; i < rows2; i++)
{
for (j = 0; j < cols2; j++)
fscanf(image2, "%d%", &mat2[i][j]);
}
}
i = 0;
j = 0;
printf("\n\n");
for (i = 0; i < rows1; i++)
{
for (j = 0; j < cols1; j++) {
a = abs(mat1[i][j] = mat2[i][j]);
b = b + a;
}
}
i = j = 0;
D = b / 196608;
if (D > 0.9)
{
printf("%d,&K");
printf("MOTION...DETECTED");
getchar();
sc_pause;
for (i = 0; i < rows1; i++) {
for (j = 0; j < cols1; j++)
{
fprintf(image3, "%d ", mat2[i][j]);
}
fprintf(image3, "\n");
}
printf("\n Image Saved....");
std::ofstream mon_fichier("F:\toto.txt");
mon_fichier << elapsed << '\n';
}
fclose(image1);
fclose(image2);
fclose(image3);
clock_t end = clock();
elapsed = ((double)end - start) / CLOCKS_PER_SEC;
printf("time is %f", elapsed);
}
};
int sc_main(int argc, char* argv[])
{
imageProcess master("EE2");
master.MotionDetector();
sc_start();
return(0);
}

What you did is basically wrong.
You copy pasted code to SC_MODULE, this code is simple C code
(Do not mix C and C++ files)
This is not how you use clock
What you should do:
You need to check if your algorithm works, for this you do not need SystemC at all
Then you can replace data types with HW one and check if it still works
Then you have to find which data interface is used in HW and how to use this interface
Then you have to tweak your alg. to work with this interface (There you can use SC_MODULE, sc ports etc...)
Also take look at SC_CTHREAD, you will need it.
Without any informations about target platform I can not provide any other help.

Teaching myself OOP in C++

So I've been working on this program for the last month. The original code is from this tutorial https://www.youtube.com/watch?v=KjHKwCZyAhQ&list=PLHm_I0tE5kKPPWXkTTtOn8fkcwEGZNETh&index=3
However, I thought I would turn it into an object oriented program before I went on. Doing rather than copying is the best way to learn. The code generated a bmp file before i divided it up, but not anymore. The program executes but it doesn't create a file. Additionally I added Hello World in my .cpp files to see if they were even being executed and it looks like they aren't. I realize in copying this that I have a lot of code, I think the problem is in the main file so hopefully if anyone is nice enough to help me they can pick it out much more quickly!
*edit
Also in the original code he had the strut as a global variable but wasn't sure which file to implement it in or even how to make something global in an OOP! Would I just put it in main above int main() ?
Output.h
#pragma once
#include "ProProcess.h" //this is just a bunch of preprocessor directives
//this program creats a single color bmp file using red, blue, and green (rgb)
class OutPut
{
public:
OutPut(const int height, std::string file_name, int dpi, int index);
~OutPut();
//savebmp_str(std::string* file_name, const int width, const int height, int dpi, int pixels, struct RGBtype);
//commented this out because I wasn't sure how I should pass all these values. Ultimately I used OutPut Object_Output in bmp.cpp so that these variables could be passed in there
const int Getwidth() { return width; }
const int Setwidth(const int x) { const int width = x; }
private:
struct RGBtype //Could be a global variable but I dont know which file to put it in
{
int r;
int g;
int b;
};
const int width = 1960; //window size
const int height = 1080;
int dpi = 72;
int number_of_pixels = width*height;
int index;
const char* file_name = "Scene.bmp";
RGBtype *pixels = new RGBtype[number_of_pixels];//creates an array so that each pixel is comprised of a mix of rgb
};
Output.cpp
#include "OutPut.h"
#include "ProProcess.h"
OutPut::OutPut(const int height, std::string file_name, int dpi, int index)
{
OutPut::RGBtype color;
for (int x = 0; x < height; x++) //nested for loop that draws out each pixel totalling 1920x1080 in all
{
for (int y = 0; y < width; y++)
{
index = y*height + x;
pixels[index].r = 311;//changing the number here changes the color
pixels[index].g = 311;
pixels[index].b = 311;
}
}
std::cout << "Hello World";
}
OutPut::~OutPut()
{
}
BMP.h
#pragma once
#include "ProProcess.h"
#include "OutPut.h"
struct RGBtype
{
int r;
int g;
int b;
};
class BMP
{
public:
BMP(const char *filename, int passed_width, int passed_height, int dpi, RGBtype* data);
~BMP();
private:
OutPut Object_Output(std::string* file_name, const int width, const int height, int dpi, int pixels, struct RGBtype);//this is to pass the variables declared in output.h so bmp.h and bmp.cpp can use them too. Not sure how I would even verify i am doing this properly!
//const char* savebmp_str();
int passed_width;
int passed_height;
int dpi;
RGBtype *data;
};
BMP.cpp
#include "BMP.h"
#include "ProProcess.h"
#include "OutPut.h"
BMP::BMP(const char *filename, int passed_width, int passed_height, int dpi, RGBtype *data)
{
std::cout << passed_height;
FILE *pFile;
int k = passed_width*passed_height;
std::cout << "The value k is" << k;
int s = 4 * k;
int filesize = 54 + s; //s is a function of width and height
double factor = 39.375;
int m = static_cast<int>(factor);
int ppm = dpi*m;
unsigned char bmpfileheader[14] = { 'B','M',0,0,0,0 ,0,0,0,0, 54,0,0,0 }; //B and M are case sensitive. They make a bmp file
unsigned char bmpinfoheader[40] = { 40,0,0,0, 0,0,0,0 ,0,0,0,0, 1,0,24,0 };// the header size 14 and 40 are part of the BMP format
bmpfileheader[2] = (unsigned char)(filesize);
bmpfileheader[3] = (unsigned char)(filesize >> 8);
bmpfileheader[4] = (unsigned char)(filesize >> 16);
bmpfileheader[5] = (unsigned char)(filesize >> 24);
bmpinfoheader[4] = (unsigned char)(passed_width);
bmpinfoheader[5] = (unsigned char)(passed_width >> 8);
bmpinfoheader[6] = (unsigned char)(passed_width >> 16);
bmpinfoheader[7] = (unsigned char)(passed_width >> 24);
bmpinfoheader[8] = (unsigned char)(passed_height);
bmpinfoheader[9] = (unsigned char)(passed_height >> 8);
bmpinfoheader[10] = (unsigned char)(passed_height >> 16);
bmpinfoheader[11] = (unsigned char)(passed_height >> 24);
bmpinfoheader[21] = (unsigned char)(s);
bmpinfoheader[22] = (unsigned char)(s >> 8);
bmpinfoheader[23] = (unsigned char)(s >> 16);
bmpinfoheader[24] = (unsigned char)(s >> 24);
bmpinfoheader[25] = (unsigned char)(ppm);
bmpinfoheader[26] = (unsigned char)(ppm >> 8);
bmpinfoheader[27] = (unsigned char)(ppm >> 16);
bmpinfoheader[28] = (unsigned char)(ppm >> 24);
bmpinfoheader[29] = (unsigned char)(ppm);
bmpinfoheader[30] = (unsigned char)(ppm >> 8);
bmpinfoheader[31] = (unsigned char)(ppm >> 16);
bmpinfoheader[32] = (unsigned char)(ppm >> 24);
pFile = fopen(filename, "wb");
fwrite(bmpfileheader, sizeof(char), 14, pFile);
fwrite(bmpinfoheader, sizeof(char), 40, pFile);
for (int i = 0; i < k; i++)
{
RGBtype rgb = data[i];
double red = (data[i].r);
double green = (data[i].g);
double blue = (data[i].b);
int color[3] = { (int)floor(blue), (int)floor(green), (int)floor(red) };
fwrite(color, 1, 3, pFile);
}
fclose(pFile);
std::cout << "Hello World";
}
BMP::~BMP()
{
}
main.cpp
#include <iostream>
#include "OutPut.h"
#include "ProProcess.h"
#include "BMP.h"
int main()
{
OutPut Pixel_gen();
BMP BMP_Format_Maker();
OutPut Object_Output();
system("Pause");
return 0;
}

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

Degraded performance of CGAL::intersection between a 32 bit and 64 bit application while multi-threading - cgal

Related

Comparing Execution time with Time Complexity in Merge & Quick Sort

Threads indexing out of bounds in CUDA kernel

GNU Radio circular buffer manipulation

compare images using systemC

Teaching myself OOP in C++

Categories

Resources