Gaussian Elimination in OpenMP - Performance Problems - optimization

I'm new to openMP, and I was trying to parallelize a Gaussian Elimination, and I'm having troubles with performance. I'm compiling the code below using:
gcc -o gaussian_elimination gaussian_elimination.c -lm -lgsl -lgslcblas -fopenmp -Wall
And setting the number of threads on the terminal with export OMP_NUM_THREADS
My problem is that the parallel version of this code runs much slower than the serial version. I believe this is because I placed #pragma omp parallel for inside the outer loop, which would force OpenMP to create and destroy threads at each iteration, which would be incredibly costly; but I haven't seen any other clear way to do the same kind of operation, and I don't think I can exchange the outer loop with the inner parallel ones.
I'm probably missing something, but I have not found any other forum threads here commenting on this particular problem. As far as execution correctness goes, my code seems to be functioning alright, the problem is just performance-wise.
Thanks in Advance
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <stdbool.h>
#include <time.h>
#include <gsl/gsl_linalg.h>
#include <gsl/gsl_rng.h>
#define DEBUG_MODE false
int random_matrix(double *A, int N,long long int seed);
int print_matrix(double *A, int N);
int print_vector(float *b,int N);
/*
 * Benchmark driver.
 *
 * Builds a random N x (N+1) augmented system [A|b] (row-major, row stride
 * D = N+1), solves a copy of it with GSL's LU routines as a timing reference,
 * then reduces the augmented matrix in place with an OpenMP-parallelised
 * Gauss-Jordan elimination using partial (row) pivoting. After the reduction
 * the solution is held in column N of A.
 *
 * Returns 0 on success, 1 if A or Aref cannot be allocated, 2 if bref cannot
 * be allocated, or the non-zero status reported by random_matrix().
 * Every heap buffer is released on every exit path.
 */
int main(int argc, char **argv){
    int N=1000;
    int i,j,k,l,i_p,s,err,D=N+1;
    int rc=0;
    long long int seed=9089123498274; // just a fixed seed only not to bother
    double *A=NULL,*Aref=NULL,*bref=NULL;
    double pivot,sw,tmp,begin,end,time_spent;
    gsl_matrix_view gsl_m;
    gsl_vector_view gsl_b;
    gsl_vector *gsl_x;
    gsl_permutation *gsl_p;

    /* Input */
    A = (double*)malloc(N*(N+1)*sizeof(double));
    if(A==NULL){
        printf("Matrix A not allocated\n");
        rc=1;
        goto cleanup;
    }
    Aref = (double*)malloc(N*N*sizeof(double));
    if(Aref==NULL){
        printf("Matrix A not allocated\n");
        rc=1;
        goto cleanup;
    }
    bref = (double*)malloc(N*sizeof(double));
    if(bref==NULL){
        printf("Vector B not allocated\n");
        rc=2;
        goto cleanup;
    }

    err= random_matrix(A,N,seed);
    if(err!=0){
        rc=err;
        goto cleanup;
    }

    /* Split the augmented matrix into the coefficient matrix and the
     * right-hand side so that GSL works on its own copies. */
    for(i=0;i<N;i++)
        for(j=0;j<N;j+=1)
            Aref[i*N+j]= A[i*D+j];
    for(i=0;i<N;i+=1)
        bref[i]= A[i*D+N];

    printf("GSL reference:\n");
    gsl_m = gsl_matrix_view_array (Aref, N, N);
    gsl_b = gsl_vector_view_array (bref, N);
    gsl_x = gsl_vector_alloc (N);
    gsl_p = gsl_permutation_alloc(N);
    begin = clock();
    gsl_linalg_LU_decomp(&gsl_m.matrix, gsl_p, &s);
    gsl_linalg_LU_solve(&gsl_m.matrix, gsl_p, &gsl_b.vector, gsl_x);
    end = clock();
    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("gsl matrix solver: %lf s\n",time_spent);
    if(DEBUG_MODE==true)
        gsl_vector_fprintf(stdout,gsl_x,"%f");
    gsl_permutation_free(gsl_p);
    gsl_vector_free(gsl_x);

    begin = omp_get_wtime();
    for(i=0;i<N;i+=1){
        /* Partial pivoting: find the row at or below i with the largest
         * absolute value in column i. */
        i_p = i;
        pivot = fabs(A[i*D+i]);
        for(j=i;j<N;j+=1)
            if(pivot<fabs(A[j*D+i])){
                pivot = fabs(A[j*D+i]);
                i_p = j;
            }
        /* Swap row i with the pivot row (columns i..N only). */
        #pragma omp parallel for shared(i,N,A,i_p) private(j,sw)
        for(j=i;j<D;j+=1){
            sw = A[i*D+j];
            A[i*D+j] = A[i_p*D+j];
            A[i_p*D+j] = sw;
        }
        /* Scale the pivot row so the diagonal entry becomes 1. */
        pivot=A[i*D+i];
        #pragma omp parallel for shared(i,D,pivot,A) private(j)
        for(j=0;j<D;j++)
            A[i*D+j]=A[i*D+j]/pivot;
        /* Eliminate column i from every other row; k = j%N walks all rows
         * except i, so each iteration touches a distinct row. */
        #pragma omp parallel for shared(i,A,N,D) private(tmp,j,k,l)
        for(j=i+1;j<N+i;j++){
            k=j%N;
            tmp=A[k*D+i];
            for(l=0;l<D;l+=1)
                A[k*D+l]=A[k*D+l]-tmp*A[i*D+l];
        }
    }
    end = omp_get_wtime();
    time_spent = (end - begin);
    printf("omp matrix solver: %lf s\n",time_spent);

    /* Output */
    if(DEBUG_MODE==true){
        printf("\nCalculated: \n");
        for(i=0;i<N;i+=1)
            printf("%.6f \n",A[i*(N+1)+N]);
        printf("\n");
    }

cleanup:
    /* free(NULL) is a no-op, so this is safe from any exit path. */
    free(bref);
    free(Aref);
    free(A);
    return rc;
}
/*
 * Fill the N x (N+1) row-major augmented matrix A with values drawn from
 * gsl_rng_uniform().
 *
 * A    : caller-owned buffer with room for N*(N+1) doubles.
 * N    : number of rows (each row has N+1 entries).
 * seed : seed applied to the generator so that runs are reproducible.
 *        It takes precedence over any seed picked up by gsl_rng_env_setup().
 *
 * Returns 0 on success, 3 if the generator could not be allocated.
 */
int random_matrix(double *A, int N,long long int seed){
    int i,j;
    const gsl_rng_type * T;
    gsl_rng *r;

    gsl_rng_env_setup();
    T = gsl_rng_default;
    r = gsl_rng_alloc (T);
    if(r==NULL)
        return 3;
    /* Previously the seed argument was silently ignored. */
    gsl_rng_set (r, (unsigned long)seed);
    for(i=0;i<N;i++)
        for(j=0;j<=N;j++)
            A[i*(N+1)+j]= gsl_rng_uniform (r);
    gsl_rng_free (r);
    return 0;
}
/*
 * Print the N x (N+1) row-major augmented matrix A, one row per line.
 * A " | " separator is written before the first coefficient, before the
 * right-hand-side entry (column N), and at the end of the row.
 *
 * Each row holds exactly N+1 values (columns 0..N); the previous loop ran
 * to column N+1 and read one element past the row (past the buffer on the
 * last row).
 *
 * Always returns 0.
 */
int print_matrix(double *A, int N){
    int i,j;
    for(i=0;i<N;i++){
        for(j=0;j<=N;j++){
            if(j==0 || j==N)
                printf(" | ");
            printf("%.2f ",A[i*(N+1)+j]);
        }
        printf(" | \n");
    }
    return 0;
}
/*
 * Write the first N entries of b to stdout, one "%f" value per line.
 * Nothing is printed when N <= 0. Always returns 0.
 */
int print_vector(float *b,int N){
    int idx = 0;
    while (idx < N) {
        printf("%f\n", b[idx]);
        ++idx;
    }
    return 0;
}
I updated the code above with the omp_get_wtime(), and now it reads as the wtime diminishing as I include more and more threads, so, it does behave as it should, although not as clean as I would like.
For 1000 x 1000 matrices I get 0.25 s for the GSL lib, 4.4 s for the serial omp run and 1.5 s for the 4-thread run.
For 3000 x 3000 matrices, I get ~ 9s for the GSL lib, ~ 117 s for the serial omp run and ~ 44 s for the 4 thread-run, thus at least adding more threads indeed speeds up the program!
Thanks a lot everyone

Related

How to link the libsvm library in google colab when executing CUDA? What is the proper linking flag for libsvm?

I am working on Google Colab and I want to use the libsvm library in my project. I downloaded libsvm and installed it. Now when I compile with !nvcc -o and run the code using CUDA, I get errors like:
undefined reference to `svm_get_nr_class
undefined reference to 'svm_predict_probability'
undefined reference to `svm_free_and_destroy_model
I guess the problem is that libsvm is not properly linked. I use -l with the appropriate flags when compiling with nvcc, but I don't know what to pass to -l to link libsvm properly.
i downloaded libsvm using
!git clone https://github.com/cjlin1/libsvm
%cd libsvm/
!make && make install
%cd /content/libsvm/python/
!make
import sys
sys.path.append('/content/libsvm/python')
%cd /content
now when i run this program
%%cuda --name Blind_Deblurring_Cuda.cu
#include <iostream>
#include <fstream>
#include <iostream>
#include <fstream>
#include "/content/brisque.h"
#include "/content/libsvm/svm.h"
#include <vector>
#include <stdio.h>
#include "fstream"
#include "iostream"
#include <algorithm>
#include <iterator>
#include <cmath>
#include<stdlib.h>
#include <math.h>
#include <curand.h>
#include <opencv2/core/cuda.hpp>
#include <opencv2/core.hpp>
#include "opencv2/imgproc.hpp"
#include "opencv2/imgcodecs.hpp"
#include <opencv2/core/core.hpp>
#include <iostream>
#include "opencv2/highgui.hpp"
#include <opencv2/core/utility.hpp>
//rescaling based on training data i libsvm
float rescale_vector[36][2];
using namespace std;
using namespace cv;
float computescore(string imagename);
void ComputeBrisqueFeature(Mat& orig, vector<double>& featurevector);
int read_range_file() {
//check if file exists
char buff[100];
int i;
string range_fname = "allrange";
FILE* range_file = fopen(range_fname.c_str(), "r");
if(range_file == NULL) return 1;
//assume standard file format for this program
fgets(buff, 100, range_file);
fgets(buff, 100, range_file);
//now we can fill the array
for(i = 0; i < 36; ++i) {
float a, b, c;
fscanf(range_file, "%f %f %f", &a, &b, &c);
rescale_vector[i][0] = b;
rescale_vector[i][1] = c;
}
return 0;
}
// Entry point: expects the image path as the first command-line argument,
// loads the scaling ranges, then prints the computed quality score.
// Returns -1 when the argument is missing or the range file cannot be read.
int main(int argc, char** argv)
{
    const bool haveImageArg = !(argc < 2);
    if(!haveImageArg) {
        cout << "Input Image argument not given." << endl;
        return -1;
    }

    //read in the allrange file to setup internal scaling array
    const int rangeStatus = read_range_file();
    if(rangeStatus != 0) {
        cerr<<"unable to open allrange file"<<endl;
        return -1;
    }

    float qualityscore = computescore(argv[1]);
    cout << "Quality Score: " << qualityscore << endl;
    return 0;
}
// Compute the quality score of the image stored at `imagename`.
//
// Steps: read the image, compute its 36 BRISQUE features, rescale each
// feature to [-1, 1] using the hard-coded min_/max_ tables, and run the
// libsvm model loaded from the file "allmodel" on the resulting node array.
//
// The model must be loaded before it is queried; the previous version left
// `model` uninitialised and passed it to libsvm. If the model file cannot be
// loaded, or the probability buffer cannot be allocated, a message is written
// to stderr and the process exits with status 1.
//
// Returns the value produced by svm_predict_probability(), narrowed to float.
float computescore(string imagename) {
    // pre-loaded vectors from allrange file
    float min_[36] = {0.336999 ,0.019667 ,0.230000 ,-0.125959 ,0.000167 ,0.000616 ,0.231000 ,-0.125873 ,0.000165 ,0.000600 ,0.241000 ,-0.128814 ,0.000179 ,0.000386 ,0.243000 ,-0.133080 ,0.000182 ,0.000421 ,0.436998 ,0.016929 ,0.247000 ,-0.200231 ,0.000104 ,0.000834 ,0.257000 ,-0.200017 ,0.000112 ,0.000876 ,0.257000 ,-0.155072 ,0.000112 ,0.000356 ,0.258000 ,-0.154374 ,0.000117 ,0.000351};
    float max_[36] = {9.999411, 0.807472, 1.644021, 0.202917, 0.712384, 0.468672, 1.644021, 0.169548, 0.713132, 0.467896, 1.553016, 0.101368, 0.687324, 0.533087, 1.554016, 0.101000, 0.689177, 0.533133, 3.639918, 0.800955, 1.096995, 0.175286, 0.755547, 0.399270, 1.095995, 0.155928, 0.751488, 0.402398, 1.041992, 0.093209, 0.623516, 0.532925, 1.042992, 0.093714, 0.621958, 0.534484};
    double qualityscore;
    int i;
    struct svm_model* model = NULL; // svm model object, loaded below
    Mat orig = imread(imagename, 1); // read image (color mode)
    vector<double> brisqueFeatures; // feature vector initialization
    ComputeBrisqueFeature(orig, brisqueFeatures); // compute brisque features

    // use the pre-trained allmodel file
    string modelfile = "allmodel";
    if((model=svm_load_model(modelfile.c_str()))==0) {
        fprintf(stderr,"can't open model file allmodel\n");
        exit(1);
    }

    struct svm_node x[37];
    // rescale the brisqueFeatures vector from -1 to 1
    // also convert vector to svm node array object
    for(i = 0; i < 36; ++i) {
        float min = min_[i];
        float max = max_[i];
        x[i].value = -1 + (2.0/(max - min) * (brisqueFeatures[i] - min));
        x[i].index = i + 1;
    }
    x[36].index = -1; // index -1 terminates the node array

    int nr_class=svm_get_nr_class(model);
    double *prob_estimates = (double *) malloc(nr_class*sizeof(double));
    if(prob_estimates == NULL) {
        fprintf(stderr,"can't allocate probability estimates\n");
        svm_free_and_destroy_model(&model);
        exit(1);
    }

    // predict quality score using libsvm class
    qualityscore = svm_predict_probability(model,x,prob_estimates);
    free(prob_estimates);
    svm_free_and_destroy_model(&model);
    return qualityscore;
}
// Compute the BRISQUE feature vector of `orig` and append it to `featurevector`.
//
// For each of `scalenum` (2) scales the function appends 18 values:
//   - 2 from the AGGD fit of the MSCN image (gamma, mean of squared sigmas)
//   - 4 for each of the 4 shift orientations (gamma, mean parameter,
//     left sigma squared, right sigma squared) of the paired-product images
// so 36 values are pushed in total. `featurevector` is only appended to,
// never cleared.
//
// NOTE(review): the statement order and conversions below determine the
// numeric values of the features; presumably the pre-trained model expects
// exactly these, so do not "clean up" the conversions without re-validating.
void ComputeBrisqueFeature(Mat& orig, vector<double>& featurevector)
{
Mat orig_bw_int(orig.size(), CV_64F, 1);
// convert to grayscale
cvtColor(orig, orig_bw_int, COLOR_BGR2GRAY);
// create a copy of original image
Mat orig_bw(orig_bw_int.size(), CV_64FC1, 1);
// NOTE(review): 1.0/255 is passed in the second argument position of
// convertTo. That position looks like the output type rather than a scale
// factor (the call a few lines below passes CV_64FC1 there and 1.0/255.0 as
// a third argument) -- confirm against the OpenCV Mat::convertTo reference.
orig_bw_int.convertTo(orig_bw, 1.0/255);
orig_bw_int.release();
// orig_bw is intended to hold the grayscale image normalized to the range 0,1
// (see the note on the convertTo call above -- TODO confirm)
int scalenum = 2; // number of times to scale the image
for (int itr_scale = 1; itr_scale<=scalenum; itr_scale++)
{
// resize image: both dimensions are divided by 2^(itr_scale-1)
Size dst_size(orig_bw.cols/cv::pow((double)2, itr_scale-1), orig_bw.rows/pow((double)2, itr_scale-1));
Mat imdist_scaled;
resize(orig_bw, imdist_scaled, dst_size, 0, 0, INTER_CUBIC); // INTER_CUBIC
// convert to CV_64FC1 with a 1/255 scale factor
imdist_scaled.convertTo(imdist_scaled, CV_64FC1, 1.0/255.0);
// calculating MSCN coefficients
// compute mu (local mean) with a 7x7 Gaussian window
Mat mu(imdist_scaled.size(), CV_64FC1, 1);
GaussianBlur(imdist_scaled, mu, Size(7, 7), 1.166);
Mat mu_sq;
cv::pow(mu, double(2.0), mu_sq);
//compute sigma (local sigma): sqrt(blur(I^2) - mu^2)
Mat sigma(imdist_scaled.size(), CV_64FC1, 1);
cv::multiply(imdist_scaled, imdist_scaled, sigma);
GaussianBlur(sigma, sigma, Size(7, 7), 1.166);
cv::subtract(sigma, mu_sq, sigma);
cv::pow(sigma, double(0.5), sigma);
add(sigma, Scalar(1.0/255), sigma); // to avoid DivideByZero Error
Mat structdis(imdist_scaled.size(), CV_64FC1, 1);
subtract(imdist_scaled, mu, structdis);
divide(structdis, sigma, structdis); // structdis is MSCN image
// Compute AGGD fit to MSCN image
double lsigma_best, rsigma_best, gamma_best;
structdis = AGGDfit(structdis, lsigma_best, rsigma_best, gamma_best);
featurevector.push_back(gamma_best);
featurevector.push_back((lsigma_best*lsigma_best + rsigma_best*rsigma_best)/2);
// Compute paired product images
// indices for orientations (H, V, D1, D2)
int shifts[4][2]={{0,1},{1,0},{1,1},{-1,1}};
for(int itr_shift=1; itr_shift<=4; itr_shift++)
{
// select the shifting index from the 2D array
int* reqshift = shifts[itr_shift-1];
// declare shifted_structdis as pairwise image
Mat shifted_structdis(imdist_scaled.size(), CV_64F, 1);
// create copies of the images using BwImage constructor
// utility constructor for better subscript access (for pixels)
BwImage OrigArr(structdis);
BwImage ShiftArr(shifted_structdis);
// create pair-wise product for the given orientation (reqshift)
for(int i=0; i<structdis.rows; i++)
{
for(int j=0; j<structdis.cols; j++)
{
// copy the neighbour at offset reqshift when it lies inside the image,
// otherwise use 0
if(i+reqshift[0]>=0 && i+reqshift[0]<structdis.rows && j+reqshift[1]>=0 && j+reqshift[1]<structdis.cols)
{
ShiftArr[i][j]=OrigArr[i + reqshift[0]][j + reqshift[1]];
}
else
{
ShiftArr[i][j]=0;
}
}
}
// Mat structdis_pairwise;
shifted_structdis = ShiftArr.equate(shifted_structdis);
// calculate the products of the pairs
multiply(structdis, shifted_structdis, shifted_structdis);
// fit the pairwise product to AGGD
shifted_structdis = AGGDfit(shifted_structdis, lsigma_best, rsigma_best, gamma_best);
double constant = sqrt(tgamma(1/gamma_best))/sqrt(tgamma(3/gamma_best));
double meanparam = (rsigma_best-lsigma_best)*(tgamma(2/gamma_best)/tgamma(1/gamma_best))*constant;
// push the calculated parameters from AGGD fit to pair-wise products
featurevector.push_back(gamma_best);
featurevector.push_back(meanparam);
featurevector.push_back(cv::pow(lsigma_best,2));
featurevector.push_back(cv::pow(rsigma_best,2));
}
}
}
// function to compute best fit parameters from AGGDfit
// Fit an asymmetric generalized Gaussian distribution (AGGD) to `structdis`.
//
// Outputs (by reference):
//   lsigma_best - root mean square of the strictly negative samples
//   rsigma_best - root mean square of the strictly positive samples
//   gamma_best  - shape value from a grid search over [0.2, 10) in steps of
//                 0.001; the search stops at the first step where the
//                 distance to the target ratio grows
// Returns a clone of `structdis`.
Mat AGGDfit(Mat structdis, double& lsigma_best, double& rsigma_best, double& gamma_best)
{
    // BwImage gives [][] pixel access (see brisque.h)
    BwImage pixels(structdis);

    long int numPos=0, numNeg=0;
    double sumSqPos=0, sumSqNeg=0, sumAbs=0;

    // accumulate one-sided statistics; exact zeros contribute to nothing
    for(int row=0; row<structdis.rows; row++)
    {
        for(int col=0; col<structdis.cols; col++)
        {
            double sample = pixels[row][col];
            if(sample>0)
            {
                numPos++;
                sumSqPos += sample*sample;
                sumAbs += sample;
            }
            else if(sample<0)
            {
                numNeg++;
                sumSqNeg += sample*sample;
                sumAbs -= sample;
            }
        }
    }

    lsigma_best = cv::pow(sumSqNeg/numNeg, 0.5);
    rsigma_best = cv::pow(sumSqPos/numPos, 0.5);

    double sigmaRatio = lsigma_best/rsigma_best;
    long int sampleCount = (structdis.cols)*(structdis.rows);
    double rhat = cv::pow(sumAbs/sampleCount, static_cast<double>(2))/((sumSqNeg + sumSqPos)/sampleCount);
    double target = rhat*(cv::pow(sigmaRatio,3) +1)*(sigmaRatio+1)/pow(pow(sigmaRatio,2)+1,2);

    double bestGamma = 0;
    double bestDiff = 1e10;
    float sampling = 0.001;
    float gam = 0.2;
    // possible to coarsen sampling to quicken the code, with some loss of accuracy
    while (gam<10)
    {
        double r_gam = tgamma(2/gam)*tgamma(2/gam)/(tgamma(1/gam)*tgamma(3/gam));
        double diff = abs(r_gam-target);
        if(diff> bestDiff) break;
        bestDiff = diff;
        bestGamma = gam;
        gam+=sampling;
    }
    gamma_best = bestGamma;

    return structdis.clone();
}
I then try to compile it using
!nvcc -o /content/src/Blind_Deblurring_Cuda /content/src/Blind_Deblurring_Cuda.cu -lopencv_core -lopencv_imgcodecs -lopencv_imgproc -lopencv_highgui -lopencv_ml
It gives the following error
/tmp/tmpxft_00003d8d_00000000-10_Blind_Deblurring_Cuda.o: In function `computescore(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)':
tmpxft_00003d8d_00000000-5_Blind_Deblurring_Cuda.cudafe1.cpp:(.text+0x9bc): undefined reference to `svm_get_nr_class'
tmpxft_00003d8d_00000000-5_Blind_Deblurring_Cuda.cudafe1.cpp:(.text+0x9fd): undefined reference to `svm_predict_probability'
tmpxft_00003d8d_00000000-5_Blind_Deblurring_Cuda.cudafe1.cpp:(.text+0xa27): undefined reference to `svm_free_and_destroy_model'
collect2: error: ld returned 1 exit status

Questions about this serial communication code? [Cortex-M4]

I'm looking at the following code from STMicroelectronics on implementing USART communication with interrupts
#include <stm32f10x_lib.h> // STM32F10x Library Definitions
#include <stdio.h>
#include "STM32_Init.h" // STM32 Initialization
/*----------------------------------------------------------------------------
Notes:
The length of the receive and transmit buffers must be a power of 2.
Each buffer has a next_in and a next_out index.
If next_in = next_out, the buffer is empty.
(next_in - next_out) % buffer_size = the number of characters in the buffer.
*----------------------------------------------------------------------------*/
#define TBUF_SIZE 256 /*** Must be a power of 2 (2,4,8,16,32,64,128,256,512,...) ***/
#define RBUF_SIZE 256 /*** Must be a power of 2 (2,4,8,16,32,64,128,256,512,...) ***/
/*----------------------------------------------------------------------------
*----------------------------------------------------------------------------*/
#if TBUF_SIZE < 2
#error TBUF_SIZE is too small. It must be larger than 1.
#elif ((TBUF_SIZE & (TBUF_SIZE-1)) != 0)
#error TBUF_SIZE must be a power of 2.
#endif
#if RBUF_SIZE < 2
#error RBUF_SIZE is too small. It must be larger than 1.
#elif ((RBUF_SIZE & (RBUF_SIZE-1)) != 0)
#error RBUF_SIZE must be a power of 2.
#endif
/*----------------------------------------------------------------------------
*----------------------------------------------------------------------------*/
struct buf_st {
unsigned int in; // Next In Index
unsigned int out; // Next Out Index
char buf [RBUF_SIZE]; // Buffer
};
static struct buf_st rbuf = { 0, 0, };
#define SIO_RBUFLEN ((unsigned short)(rbuf.in - rbuf.out))
static struct buf_st tbuf = { 0, 0, };
#define SIO_TBUFLEN ((unsigned short)(tbuf.in - tbuf.out))
static unsigned int tx_restart = 1; // NZ if TX restart is required
/*----------------------------------------------------------------------------
USART1_IRQHandler
Handles USART1 global interrupt request.
*----------------------------------------------------------------------------*/
/*
 * USART1 interrupt service routine.
 *
 * Takes one snapshot of the status register and then services, in order:
 *   - receive: stores the received character in rbuf if it is not full
 *   - transmit: sends the next character from tbuf, or, when tbuf is empty,
 *     sets tx_restart and clears the TX bit in CR1 so SendChar re-arms it
 */
void USART1_IRQHandler (void) {
volatile unsigned int IIR;
struct buf_st *p;
IIR = USART1->SR; // snapshot of the status flags for this invocation
if (IIR & USART_FLAG_RXNE) { // read interrupt
USART1->SR &= ~USART_FLAG_RXNE; // clear interrupt
p = &rbuf;
// True while (in - out) has no bits set at or above RBUF_SIZE, i.e. while
// fewer than RBUF_SIZE characters are buffered. When the buffer is full the
// data register is not read here and the character is not stored.
if (((p->in - p->out) & ~(RBUF_SIZE-1)) == 0) {
p->buf [p->in & (RBUF_SIZE-1)] = (USART1->DR & 0x1FF);
p->in++;
}
}
if (IIR & USART_FLAG_TXE) {
USART1->SR &= ~USART_FLAG_TXE; // clear interrupt
p = &tbuf;
if (p->in != p->out) { // something is queued for transmission
USART1->DR = (p->buf [p->out & (TBUF_SIZE-1)] & 0x1FF);
p->out++;
tx_restart = 0;
}
else {
tx_restart = 1;
// NOTE(review): a status-register flag constant is used as the CR1 mask;
// presumably the TX interrupt enable bit sits at the same bit position --
// confirm against the device reference manual.
USART1->CR1 &= ~USART_FLAG_TXE; // disable TX interrupt if nothing to send
}
}
}
/*------------------------------------------------------------------------------
buffer_Init
initialize the buffers
*------------------------------------------------------------------------------*/
void buffer_Init (void) {
tbuf.in = 0; // Clear com buffer indexes
tbuf.out = 0;
tx_restart = 1;
rbuf.in = 0;
rbuf.out = 0;
}
/*------------------------------------------------------------------------------
SendChar
transmit a character
*------------------------------------------------------------------------------*/
/*
 * Queue one character for transmission.
 * Returns 0 when the character was queued, -1 when the transmit buffer is
 * full. Re-enables the TX interrupt when tx_restart says it is off.
 */
int SendChar (int c) {
  struct buf_st *tx = &tbuf;

  // If the buffer is full, return an error value
  if (SIO_TBUFLEN >= TBUF_SIZE) {
    return (-1);
  }

  // Add data to the transmit buffer, then publish it by advancing `in`.
  tx->buf [tx->in & (TBUF_SIZE - 1)] = c;
  tx->in++;

  if (!tx_restart) {
    return (0);                          // TX interrupt already running
  }

  // If transmit interrupt is disabled, enable it
  tx_restart = 0;
  USART1->CR1 |= USART_FLAG_TXE;         // enable TX interrupt
  return (0);
}
/*------------------------------------------------------------------------------
GetKey
receive a character
*------------------------------------------------------------------------------*/
/*
 * Fetch one received character.
 * Returns the oldest buffered character, or -1 when the receive buffer is
 * empty.
 */
int GetKey (void) {
  struct buf_st *rx = &rbuf;
  char ch;

  if (SIO_RBUFLEN == 0) {
    return (-1);
  }

  // Read the slot first, then release it by advancing `out`.
  ch = rx->buf [rx->out & (RBUF_SIZE - 1)];
  rx->out++;
  return (ch);
}
/*----------------------------------------------------------------------------
MAIN function
*----------------------------------------------------------------------------*/
/* Demo loop: prompt for a key, read it with getchar, and echo it back. */
int main (void) {
  buffer_Init();                         // init RX / TX buffers
  stm32_Init ();                         // STM32 setup

  printf ("Interrupt driven Serial I/O Example\r\n\r\n");

  for (;;) {                             // Loop forever
    unsigned char key;

    printf ("Press a key. ");
    key = getchar ();
    printf ("\r\n");
    printf ("You pressed '%c'.\r\n\r\n", key);
  }
}
My questions are the following:
In the handler function, when does the statement ((p->in - p->out) & ~(RBUF_SIZE-1)) ever evaluate to a value other than zero? If RBUF_SIZE is a power of 2 as indicated, then ~(RBUF_SIZE-1) should always be zero. Is it checking if p->in > p->out? Even if this isn't true, the conditional should evaluate to zero anyway, right?
In the line following, the statement p->buf [p->in & (RBUF_SIZE-1)] = (USART1->DR & 0x1FF); is made. Why does the code AND p->in with RBUF_SIZE-1?
What kind of buffer are we using in this code? FIFO?
Not so. For example, assuming 32-bit arithmetic, if RBUF_SIZE == 0x00000100 then RBUF_SIZE-1 == 0x000000FF and ~(RBUF_SIZE-1) == 0xFFFFFF00 (it's a bitwise NOT, not a logical NOT). The check you refer to is therefore effectively the same as (p->in - p->out) < RBUF_SIZE, and it's not clear why it is superior. ARM GCC 7.2.1 produces identical length code for the two (-O1).
p->in & (RBUF_SIZE-1) is the same as p->in % RBUF_SIZE when p->in is unsigned. Again, not sure why the former would be used when the latter is clearer; sure, it effectively forces the compiler to compute the modulo using an AND operation, but given that RBUF_SIZE is known at compile time to be a power of two my guess is that most compilers could figure this out (again, ARM GCC 7.2.1 certainly can, I've just tried it - it produces the same instructions either way).
Looks like it. FIFO implemented as a circular buffer.

Running gzip on a single core in a multicore environment under Unix

I have a requirement to use only a single core to test gzip performance in a multi-core CPU environment (I'm not sure what gzip's default settings are in this case). I need help finding the command to run gzip compression on a single core.
Thanks
gzip is single-threaded by default, so in effect it will look like it's running on one core; i.e., it might be scheduled on several physical cores over time, but it won't run in parallel.
If you absolutely must run on one core and you're on linux you would set affinity to a particular core.
http://man7.org/linux/man-pages/man2/sched_setaffinity.2.html
This is code that I got from the man page.
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>
#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \
} while (0)
int
main(int argc, char *argv[])
{
cpu_set_t set;
int parentCPU, childCPU;
int nloops, j;
if (argc != 4) {
fprintf(stderr, "Usage: %s parent-cpu child-cpu num-loops\n",
argv[0]);
exit(EXIT_FAILURE);
}
parentCPU = atoi(argv[1]);
childCPU = atoi(argv[2]);
nloops = atoi(argv[3]);
CPU_ZERO(&set);
switch (fork()) {
case -1: /* Error */
errExit("fork");
case 0: /* Child */
CPU_SET(childCPU, &set);
if (sched_setaffinity(getpid(), sizeof(set), &set) == -1)
errExit("sched_setaffinity");
for (j = 0; j < nloops; j++)
getppid();
exit(EXIT_SUCCESS);
default: /* Parent */
CPU_SET(parentCPU, &set);
if (sched_setaffinity(getpid(), sizeof(set), &set) == -1)
errExit("sched_setaffinity");
for (j = 0; j < nloops; j++)
getppid();
wait(NULL); /* Wait for child to terminate */
exit(EXIT_SUCCESS);
}
}
If you need to test with no interruptions from the kernel you need to write a kernel module for that.

What does PKCS5_PBKDF2_HMAC_SHA1 return value mean?

I'm attempting to use OpenSSL's PKCS5_PBKDF2_HMAC_SHA1 method. I gather that it returns 0 if it succeeds, and some other value otherwise. My question is, what does a non-zero return value mean? Memory error? Usage error? How should my program handle it (retry, quit?)?
Edit: A corollary question is, is there any way to figure this out besides reverse-engineering the method itself?
is there any way to figure this out besides reverse-engineering the method itself?
PKCS5_PBKDF2_HMAC_SHA1 looks like one of those undocumented functions because I can't find it in the OpenSSL docs. OpenSSL has a lot of them, so you should be prepared to study the sources if you are going to use the library.
I gather that it returns 0 if it succeeds, and some other value otherwise.
Actually, it's reversed. Here's how I know...
$ grep -R PKCS5_PBKDF2_HMAC_SHA1 *
crypto/evp/evp.h:int PKCS5_PBKDF2_HMAC_SHA1(const char *pass, int passlen,
crypto/evp/p5_crpt2.c:int PKCS5_PBKDF2_HMAC_SHA1(const char *pass, int passlen,
...
So, you find the function's implementation in crypto/evp/p5_crpt2.c:
int PKCS5_PBKDF2_HMAC_SHA1(const char *pass, int passlen,
const unsigned char *salt, int saltlen, int iter,
int keylen, unsigned char *out)
{
return PKCS5_PBKDF2_HMAC(pass, passlen, salt, saltlen, iter,
EVP_sha1(), keylen, out);
}
Following PKCS5_PBKDF2_HMAC:
$ grep -R PKCS5_PBKDF2_HMAC *
...
crypto/evp/evp.h:int PKCS5_PBKDF2_HMAC(const char *pass, int passlen,
crypto/evp/p5_crpt2.c:int PKCS5_PBKDF2_HMAC(const char *pass, int passlen,
...
And again, from crypto/evp/p5_crpt2.c:
/*
 * Derive `keylen` bytes into `out` from a password and salt using HMAC with
 * the given digest, iterated `iter` times per output block.
 *
 * pass/passlen : password; a NULL pass is treated as length 0, and a
 *                passlen of -1 means "use strlen(pass)".
 * salt/saltlen : salt bytes fed into the first HMAC of every block.
 * iter         : iteration count (the first HMAC counts as iteration 1).
 * digest       : hash used by the HMAC; its size is the output block size.
 * keylen, out  : number of bytes to produce and where to write them.
 *
 * Returns 1 on success and 0 on any failure (negative digest size or a
 * failing HMAC call). On failure `out` may have been partially written.
 */
int PKCS5_PBKDF2_HMAC(const char *pass, int passlen,
const unsigned char *salt, int saltlen, int iter,
const EVP_MD *digest,
int keylen, unsigned char *out)
{
unsigned char digtmp[EVP_MAX_MD_SIZE], *p, itmp[4];
int cplen, j, k, tkeylen, mdlen;
unsigned long i = 1; /* 1-based block counter */
HMAC_CTX hctx_tpl, hctx;
mdlen = EVP_MD_size(digest);
if (mdlen < 0)
return 0;
HMAC_CTX_init(&hctx_tpl);
p = out;
tkeylen = keylen;
if(!pass)
passlen = 0;
else if(passlen == -1)
passlen = strlen(pass);
/* Key the template context once; it is copied for every HMAC below. */
if (!HMAC_Init_ex(&hctx_tpl, pass, passlen, digest, NULL))
{
HMAC_CTX_cleanup(&hctx_tpl);
return 0;
}
/* One pass of this loop produces one output block of up to mdlen bytes. */
while(tkeylen)
{
if(tkeylen > mdlen)
cplen = mdlen;
else
cplen = tkeylen;
/* We are unlikely to ever use more than 256 blocks (5120 bits!)
* but just in case...
*/
/* Encode the block counter as 4 bytes, most significant byte first. */
itmp[0] = (unsigned char)((i >> 24) & 0xff);
itmp[1] = (unsigned char)((i >> 16) & 0xff);
itmp[2] = (unsigned char)((i >> 8) & 0xff);
itmp[3] = (unsigned char)(i & 0xff);
if (!HMAC_CTX_copy(&hctx, &hctx_tpl))
{
HMAC_CTX_cleanup(&hctx_tpl);
return 0;
}
/* First iteration: HMAC over salt followed by the encoded counter. */
if (!HMAC_Update(&hctx, salt, saltlen)
|| !HMAC_Update(&hctx, itmp, 4)
|| !HMAC_Final(&hctx, digtmp, NULL))
{
HMAC_CTX_cleanup(&hctx_tpl);
HMAC_CTX_cleanup(&hctx);
return 0;
}
HMAC_CTX_cleanup(&hctx);
memcpy(p, digtmp, cplen);
/* Remaining iterations: HMAC the previous result and XOR it into the block. */
for(j = 1; j < iter; j++)
{
if (!HMAC_CTX_copy(&hctx, &hctx_tpl))
{
HMAC_CTX_cleanup(&hctx_tpl);
return 0;
}
if (!HMAC_Update(&hctx, digtmp, mdlen)
|| !HMAC_Final(&hctx, digtmp, NULL))
{
HMAC_CTX_cleanup(&hctx_tpl);
HMAC_CTX_cleanup(&hctx);
return 0;
}
HMAC_CTX_cleanup(&hctx);
for(k = 0; k < cplen; k++)
p[k] ^= digtmp[k];
}
/* Advance to the next output block. */
tkeylen-= cplen;
i++;
p+= cplen;
}
HMAC_CTX_cleanup(&hctx_tpl);
return 1;
}
So it looks like 0 on failure, and 1 on success. You should not see other values. And if you get a 0, then all the OUT parameters are junk.
Memory error? Usage error?
Well, sometimes you can call ERR_get_error. If you call it and it makes sense, then the error code is good. If the error code makes no sense, then it's probably not good.
Sadly, that's the way I handle it because the library is not consistent with setting error codes. For example, here's the library code to load the RDRAND engine.
Notice the code clears the error code on failure if it's a 3rd generation Ivy Bridge (that's the capability being tested), and does not clear or set an error otherwise!
void ENGINE_load_rdrand (void)
{
extern unsigned int OPENSSL_ia32cap_P[];
if (OPENSSL_ia32cap_P[1] & (1<<(62-32)))
{
ENGINE *toadd = ENGINE_rdrand();
if(!toadd) return;
ENGINE_add(toadd);
ENGINE_free(toadd);
ERR_clear_error();
}
}
How should my program handle it (retry, quit?)?
It looks like a hard failure.
Finally, that's exactly how I navigate the sources in this situation. If you don't like grep you can try ctags or another source code browser.

Create a Fraction array

I have to Create a dynamic array capable of holding 2*n Fractions.
If the dynamic array cannot be allocated, prints a message and calls exit(1).
It next fills the array with reduced random Fractions whose numerator
is between 1 and 20, inclusive; and whose initial denominator
is between 2 and 20, inclusive.
I already wrote the function that creates a fraction and reduces it; this is what I have. When I compile and run this program it crashes, and I can't find out why. If I put 1 instead of 10 in test.c it doesn't crash, but it gives me a crazy fraction. If I put 7, 8, or 11 in test.c it crashes. I would appreciate it if someone could help me.
FractionSumTester.c
/*
 * Draw random fractions until one reduces to a denominator greater than 1.
 * The numerator is drawn with randomInt(minNum, max) and the denominator
 * with randomInt(minDenom, max). Returns the reduced fraction.
 */
Fraction randomFraction(int minNum, int minDenom, int max)
{
    Fraction raw;
    Fraction reduced;

    do {
        raw.numerator = randomInt(minNum, max);
        raw.denominator = randomInt(minDenom, max);
        reduced = reduceFraction(raw);
    } while (reduced.denominator <= 1);

    return reduced;
}
/*
 * Allocate an array of 2*n Fractions, fill it with randomFraction(1,2,20)
 * values and print each one as "numerator/denominator".
 *
 * The allocation is sized for all 2*n elements that the loop writes; the
 * previous version allocated only n and overran the buffer.
 *
 * If the allocation fails, prints "error" and calls exit(1).
 * Returns the array; the caller owns it and must free() it.
 */
Fraction *createFractionArray(int n)
{
    Fraction *p;
    size_t count = (n > 0) ? 2 * (size_t)n : 0;
    size_t i;

    p = malloc(count * sizeof *p);
    if (p == NULL)
    {
        printf("error");
        exit(1);
    }
    for (i = 0; i < count; i++)
    {
        p[i] = randomFraction(1,2,20);
        printf("%d/%d\n", p[i].numerator, p[i].denominator);
    }
    return p;
}
This is what I am using to test these two functions.
test.c
#include "Fraction.h"
#include "FractionSumTester.h"
#include <stdio.h>
/* Test driver: builds (and prints) a fraction array for n = 10. */
int main()
{
    const int n = 10;

    (void)createFractionArray(n);
    return 0;
}
In your createFractionArray() function, you malloc() space for n items. Then, in the for loop, you write 2*n items into that space... which overruns your buffer and causes the crash.