how to deal with inline assembly

how to deal with inline assembly - inline-assembly

why sum2 result is 6?? ,here is the code
#if defined(__aarch64__)
int tmp=0;
int sum2=0;
int a1=2;
__asm__ __volatile__
(
"mov %0,3\n\t" //tmp=3
"add %1,%0,%2\n\t" //sum2=tmp+a1=3+2 ???
:"=r"(tmp),"=r"(sum2)
:"r"(a1)
);
LOG_D(TAG,"a=%d\n",sum2);
#endif

Related

why scip making an extra branching decision after calling SCIPincludeBranchRrule?

Now I want to do something on the branch and bound using SCIP, and begin from the branching rule. While I tracking the branching process I found something I cannot understand. Begining from getting the branching variable candidates using SCIPgetLPBranchCands, I get the SCIP_VAR** lpcands, then I select the first variable to branch using SCIPbranchVar. This branching rule works on each focused node.
Consider the branch decision at the number 1 node (the root node), after executing SCIPbranchVar function, SCIPgetChildren return two child nodes (2 and 3), but the child node number of root node changed, two or more child node appeared. Suppose node selector selected the node of number 2 to branch, SCIPgetSiblings return 3 siblings (3, 4, and 5).
To make it clear that what happened, I print the branching decision path using SCIPprintNodeRootPath, and print the relationship between nodes. The result shows that after I branching at a node on the selected variable, the SCIIP branching at the same node on another variable which I don't know.
I have print these information generated by scipoptsuite-7.0.1/scip/examples/Binpacking, no more branching decision is made. But more child nodes appearing after I replace the ryanfoster rule using the branching on the first variable. After that, I try the create child node branching style for my procedure, extra nodes appeared anyway.
I cannot find out what happened and how to totally control the branching process, it is very important for my work in the future for it determined how to design the branching rule. I have even try to read the source code of SCIP, unfortunately, it's too hard for me to read.
My procedure as follows:
main.cpp
/* standard library includes */
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
/* scip includes */
#include "objscip/objscip.h"
#include "objscip/objscipdefplugins.h"
/*user file includes*/
#include "branch_rule.h"
/* namespace usage */
using namespace std;
using namespace scip;
static
SCIP_RETCODE execmain()
{
SCIP* scip = NULL;
/* initialize SCIP environment */
SCIP_CALL( SCIPcreate(&scip) );
SCIP_CALL( SCIPincludeBranchRrule(scip) );
/* include default plugins */
SCIP_CALL( SCIPincludeDefaultPlugins(scip) );
SCIP_CALL( SCIPsetIntParam(scip,"presolving/maxrestarts",0) );
SCIP_CALL( SCIPsetSeparating(scip, SCIP_PARAMSETTING_OFF, TRUE) );
SCIP_CALL(SCIPreadProb(scip, "map18.mps.gz", NULL));
SCIP_CALL( SCIPsolve(scip) );
return SCIP_OKAY;
}
int main(int argc, char** argv)
{
return execmain() != SCIP_OKAY ? 1 : 0;
}
branch_rule.h
#ifndef VRP_BRANCH_RULE_H
#define VRP_BRANCH_RULE_H
#include "scip/scip.h"
#include <vector>
SCIP_RETCODE SCIPincludeBranchRrule(
SCIP* scip
);
#endif //VRP_BRANCH_RULE_H
branch_rule.cpp
#include "branch_rule.h"
#include <assert.h>
#include <string.h>
#include <iostream>
#include <sstream>
#include <iomanip>
#include <fstream>
#include "scip/struct_tree.h"
#include "scip/type_var.h"
using namespace std;
/**#name Branching rule properties
*
* #{
*/
#define BRANCHRULE_NAME "branch rule test"
#define BRANCHRULE_DESC "branch rule test"
#define BRANCHRULE_PRIORITY 50000
#define BRANCHRULE_MAXDEPTH -1
#define BRANCHRULE_MAXBOUNDDIST 1.0
void printBranchTreeNode(SCIP_NODE *node, SCIP_NODE **siblings, int nsiblings){
ofstream f1("branch_path/node_information.txt", ios::app);
if(!f1)return;
f1 << "node number：" << node->number << ", sibling number: " << nsiblings;
if(NULL==node->parent)
f1 << std::endl;
else{
f1 << ", parent number: " << node->parent->number << std::endl;
}
f1 << setw(20) << "siblings：" << std::endl;
for(int i = 0; i < nsiblings; ++i){
f1 << setw(20) << "node number：" << siblings[i]->number << ", sibling number: " << nsiblings << ", parent number: " << siblings[i]->parent->number << endl;
}
f1.close();
}
static
SCIP_DECL_BRANCHEXECLP(branchExeclpTest)
{
assert(scip != NULL);
assert(branchrule != NULL);
assert(strcmp(SCIPbranchruleGetName(branchrule), BRANCHRULE_NAME) == 0);
assert(result != NULL);
SCIP_NODE *current_node = SCIPgetCurrentNode(scip) ;
SCIP_NODE **leaves = NULL;
int nleaves;
SCIP_CALL( SCIPgetLeaves (scip, &leaves, &nleaves) );
std::vector<SCIP_NODE*> leaves_vector;
for(int i =0; i < nleaves; ++i){
leaves_vector.push_back(leaves[i]);
}
SCIP_NODE **current_node_slibings = NULL;
int ncurrent_node_slibings;
SCIP_CALL( SCIPgetSiblings(scip, &current_node_slibings, &ncurrent_node_slibings) );
std::vector<SCIP_NODE*> slibings_vector;
for(int i =0; i < ncurrent_node_slibings; ++i){
slibings_vector.push_back(current_node_slibings[i]);
}
printBranchTreeNode(current_node, current_node_slibings, ncurrent_node_slibings);
SCIP_NODE **childrens;
int nchildrens;
SCIP_CALL( SCIPgetChildren(scip, &childrens, &nchildrens) );
std::vector<SCIP_NODE*> childrens_vector;
for(int i =0; i < nchildrens; ++i){
childrens_vector.push_back(childrens[i]);
}
stringstream filename;
filename.str("");
filename << "branch_path/branch_path_to_node_" << current_node->number <<".dat";
std::string stringFileName = filename.str();
FILE *fp = fopen(stringFileName.c_str(), "wt+");
SCIP_CALL( SCIPprintNodeRootPath(scip, current_node, fp) );
fclose(fp);
// create child node branching style
SCIP_NODE *childsame;
SCIP_NODE *childdiffer;
SCIP_CALL( SCIPcreateChild(scip, &childsame, 0.0, SCIPgetLocalTransEstimate(scip)) );
SCIP_CALL( SCIPcreateChild(scip, &childdiffer, 0.0, SCIPgetLocalTransEstimate(scip)) );
/*
// SCIPbranchVar branching style
SCIP_VAR** lpcands;
SCIP_Real* lpcandssol;
SCIP_Real* lpcandsfrac;
int nlpcands;
int npriolpcands;
int nfracimplvars;
SCIP_CALL( SCIPgetLPBranchCands(scip, &lpcands, &lpcandssol, &lpcandsfrac, &nlpcands, &npriolpcands, &nfracimplvars) );
SCIP_NODE* downchild;
SCIP_NODE* eqchild;
SCIP_NODE* upchild;
SCIP_CALL( SCIPbranchVar(scip, lpcands[0], &downchild, &eqchild, &upchild) );
// SCIP_CALL( SCIPbranchVar(scip, var, NULL, NULL, NULL) );
SCIP_CALL( SCIPgetLeaves (scip, &leaves, &nleaves) );
int numLeaves = SCIPgetNLeaves(scip);
SCIP_CALL( SCIPgetSiblings(scip, &current_node_slibings, &ncurrent_node_slibings) );
slibings_vector.clear();
for(int i =0; i < ncurrent_node_slibings; ++i){
slibings_vector.push_back(current_node_slibings[i]);
}
SCIP_CALL( SCIPgetChildren(scip, &childrens, &nchildrens) );
childrens_vector.clear();
for(int i =0; i < nchildrens; ++i){
childrens_vector.push_back(childrens[i]);
}
*/
return SCIP_OKAY;
}
SCIP_RETCODE SCIPincludeBranchRrule(
SCIP* scip /**< SCIP data structure */
){
SCIP_BRANCHRULEDATA* branchruledata;
SCIP_BRANCHRULE* branchrule;
branchruledata = NULL;
branchrule = NULL;
// include branching rule
SCIP_CALL( SCIPincludeBranchruleBasic(scip, &branchrule, BRANCHRULE_NAME, BRANCHRULE_DESC, BRANCHRULE_PRIORITY, BRANCHRULE_MAXDEPTH,
BRANCHRULE_MAXBOUNDDIST, branchruledata) );
assert(branchrule != NULL);
SCIP_CALL( SCIPsetBranchruleExecLp(scip, branchrule, branchExeclpTest) );
return SCIP_OKAY;
}
Does anyone can help me to solve this problem?

so the first thing I notice when looking at your code is that you do not set the result pointer. After branching, you need to set *result = SCIP_BRANCHED;.
Can you please try that and check if it fixes your problem?

Raku/Perl6: how do you code a NULL with NativeCall

https://docs.perl6.org/language/nativecall
"As you may have predicted by now, a NULL pointer
is represented by the type object of the struct type."
https://learn.microsoft.com/en-us/windows/win32/api/winreg/nf-winreg-regqueryvalueexw
C++
LSTATUS RegQueryValueExW(
HKEY hKey,
LPCWSTR lpValueName,
LPDWORD lpReserved,
LPDWORD lpType,
LPBYTE lpData,
LPDWORD lpcbData
);
lpReserved
This parameter is reserved and must be NULL.
With "native", how do I satisfy the "NULL" requirement?
constant WCHAR := uint16;
constant DWORD := int32;
sub RegQueryValueExW( DWORD, WCHARS, DWORD, DWORD, DWORD is rw, DWORD is rw ) is native("Kernel32.dll") returns DWORD { * };
$RtnCode = RegQueryValueExW( $Handle, $lpValueName, int32, REG_DWORD, $lpData, $lpcbData );
"int32" returns:
Cannot unbox a type object (int32) to int in method
CALL-ME at C:\rakudo\share\perl6\sources \947BDAB9F96E0E5FCCB383124F9
23A6BF6F8D76B (NativeCall) line 587
Many thanks,
-T

To pass a pointer to a DWORD you can use a CArray[DWORD]. For example, here I created a test library libmylib.so with a foo() function taking DWORD * (aka int32_t *) argument:
#include <stdio.h>
#include <stdint.h>
void foo (int32_t *bar) {
if ( bar == NULL ) {
printf( "Got NULL pointer\n" );
}
else {
printf("Got bar: %d\n", bar[0]);
}
}
Then test a Raku interface to this library using:
use v6;
use NativeCall;
constant DWORD := int32;
sub foo(CArray[DWORD]) is native("./libmylib.so") { * };
my #bar := CArray[DWORD].new;
#bar[0] = 1;
foo(#bar);
foo(CArray[DWORD]); # <-- Use a type object to pass a NULL pointer
Output:
Got bar: 1
Got NULL pointer

JJ and Brad on the Perl6 mailing list were correct. For a NULL, just pass it a zero. I had a booboo somewhere else.

CGAL: get the info of nearest neighbors

I am using the Point_set_2 data structure in order to find k nearest neighbors of a query point, I want to retrieve the index of neighbors; I used the following code, but it->info() produces errors!
I have also seen this post, but for me the priority is using Point_set_2 method:
#include <CGAL/Exact_predicates_inexact_constructions_kernel.h>
#include <CGAL/Delaunay_triangulation_2.h>
#include <CGAL/Triangulation_vertex_base_with_info_2.h>
#include <CGAL/Point_set_2.h>
#include <vector>
typedef CGAL::Exact_predicates_inexact_constructions_kernel K;
typedef CGAL::Triangulation_vertex_base_with_info_2<unsigned, K> Vb;
typedef CGAL::Triangulation_data_structure_2<Vb> Tds;
typedef CGAL::Delaunay_triangulation_2<K, Tds> Delaunay;
//typedef Delaunay::Point Point;
typedef CGAL::Point_set_2<K,Tds>::Edge_iterator Edge_iterator;
typedef CGAL::Point_set_2<K,Tds>::Vertex_handle Vertex_handle;
typedef K::Point_2 Point_2;
CGAL::Point_set_2<K,Tds> PSet;
int main()
{
std::vector< std::pair<Point_2,unsigned> > points;
points.push_back( std::make_pair(Point_2(0,0),0) );
points.push_back( std::make_pair(Point_2(1,0),1) );
points.push_back( std::make_pair(Point_2(0,1),2) );
points.push_back( std::make_pair(Point_2(14,4),3) );
points.push_back( std::make_pair(Point_2(2,2),4) );
points.push_back( std::make_pair(Point_2(-4,0),5) );
PSet.insert(points.begin(),points.end());
// init
Point_2 actual(30,45,10);
// nearest neighbor ...
Vertex_handle v = PSet.nearest_neighbor(actual);
std::cout << "Nearest neighbor:" << v->point() << "\n";
// k nearest neighbors ...
std::vector<Vertex_handle> L;
std::vector<Vertex_handle>::const_iterator it;
PSet.nearest_neighbors(actual,5, std::back_inserter(L));
std::cout << "actual point: " << actual << "\n";
for (it=L.begin();it != L.end(); it++)
std::cout << it->info() << "\n";
return 0;
}

A Vertex_handle is roughly equivalent of a pointer. To access it data member you have to either dereference it or use the -> operator.
If you have a vector of Vertex_handle, then iterator are over Vertex_handle
which means that you have to dereference the iterator to access the Vertex_handle. You should write (*it)->info().
Maybe the confusion came from the fact that the iterators of the triangulation are implicitly convertible to the handle types.

Gaussian Elimination in OpenMP - Performance Problems

I'm new to openMP, and I was trying to parallelize a Gaussian Elimination, and I'm having troubles with performance. I'm compiling the code below using:
gcc -o gaussian_elimination gaussian_elimination.c -lm -lgsl -lgslcblas -fopenmp -Wall
And setting the number of threads on the terminal with export OMP_NUM_THREADS
And my problem is that the parallel version of this code is running way slower than the serial version of the same. I believe that this is because I declared #pragma parallel for inside the external loop, and this would force openMP to create and destroy thread at each iteration, which would be incredibly costly, but I haven't seen any other clear way to do the same kind of operation, and I don't think I can exchange the external loop with the internal parallel ones.
I'm probably missing something, but I have not found any other forum threads here commenting on this particular problem. As far as execution correctness goes, my code seems to be functioning alright, the problem is just performance-wise.
Thanks in Advance
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <stdbool.h>
#include <time.h>
#include <gsl/gsl_linalg.h>
#include <gsl/gsl_rng.h>
#define DEBUG_MODE false
int random_matrix(double *A, int N,long long int seed);
int print_matrix(double *A, int N);
int print_vector(float *b,int N);
int main(int argc, char **argv){
int N=1000;
int i,j,k,l,i_p,s,err,D=N+1;
long long int seed=9089123498274; // just a fixed seed only not to bother
double *A,pivot,sw,tmp,begin,end,time_spent;
double *Aref,*bref;
gsl_matrix_view gsl_m;
gsl_vector_view gsl_b;
gsl_vector *gsl_x;
gsl_permutation *gsl_p;
/* Input */
//scanf("%d",&N);
A = (double*)malloc(N*(N+1)*sizeof(double));
if(A==NULL){
printf("Matrix A not allocated\n");
return 1;
}
Aref = (double*)malloc(N*N*sizeof(double));
if(Aref==NULL){
printf("Matrix A not allocated\n");
return 1;
}
bref = (double*)malloc(N*sizeof(double));
if(bref==NULL){
printf("Vector B not allocated\n");
return 2;
}
/*
for(i=0;i<N;i+=1)
for(j=0;j<N;j+=1)
scanf("%f",&(A[i*N+j]));
for(i=0;i<N;i+=1)
scanf("%f",&(b[i]));
*/
/*
for(i=0;i<N*N;i++)
A[i]=(float) a_data[i];
for(i=0;i<N;i+=1)
b[i]=(float) b_data[i]; */
err= random_matrix(A,N,seed);
if(err!=0)
return err;
for(i=0;i<N;i++)
for(j=0;j<N;j+=1)
Aref[i*N+j]= A[i*D+j];
for(i=0;i<N;i+=1)
bref[i]= A[i*D+N];//b[i];
printf("GSL reference:\n");
gsl_m = gsl_matrix_view_array (Aref, N, N);
gsl_b = gsl_vector_view_array (bref, N);
gsl_x = gsl_vector_alloc (N);
gsl_p = gsl_permutation_alloc(N);
begin = clock();
gsl_linalg_LU_decomp(&gsl_m.matrix, gsl_p, &s);
gsl_linalg_LU_solve(&gsl_m.matrix, gsl_p, &gsl_b.vector, gsl_x);
end = clock();
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("gsl matrix solver: %lf s\n",time_spent);
if(DEBUG_MODE==true)
gsl_vector_fprintf(stdout,gsl_x,"%f");
gsl_permutation_free(gsl_p);
gsl_vector_free(gsl_x);
begin = omp_get_wtime();
for(i=0;i<N;i+=1){
i_p = i;
pivot = fabs(A[i*D+i]);
for(j=i;j<N;j+=1)
if(pivot<fabs(A[j*D+i])){
pivot = fabs(A[j*D+i]);
i_p = j;
}
#pragma omp parallel for shared(i,N,A,i_p) private(j,sw)
for(j=i;j<D;j+=1){
sw = A[i*D+j];
A[i*D+j] = A[i_p*D+j];
A[i_p*D+j] = sw;
}
pivot=A[i*D+i];
#pragma omp parallel for shared(i,D,pivot,A) private(j)
for(j=0;j<D;j++)
A[i*D+j]=A[i*D+j]/pivot;
#pragma omp parallel for shared(i,A,N,D) private(tmp,j,k,l)
for(j=i+1;j<N+i;j++){
k=j%N;
tmp=A[k*D+i];
for(l=0;l<D;l+=1)
A[k*D+l]=A[k*D+l]-tmp*A[i*D+l];
}
}
end = omp_get_wtime();
time_spent = (end - begin);
printf("omp matrix solver: %lf s\n",time_spent);
/* Output */
if(DEBUG_MODE==true){
printf("\nCalculated: \n");
for(i=0;i<N;i+=1)
printf("%.6f \n",A[i*(N+1)+N]);
printf("\n");
}
free(A);
return 0;
}
int random_matrix(double *A, int N,long long int seed){
int i,j;
const gsl_rng_type * T;
gsl_rng *r;
gsl_rng_env_setup();
T = gsl_rng_default;
r = gsl_rng_alloc (T);
for(i=0;i<N;i++)
for(j=0;j<=N;j++)
A[i*(N+1)+j]= gsl_rng_uniform (r);
gsl_rng_free (r);
return 0;
}
int print_matrix(double *A, int N){
int i,j;
for(i=0;i<N;i++)
for(j=0;j<=N+1;j++){
if(j==0 || j==N || j==N+1)
printf(" | ");
printf("%.2f ",A[i*(N+1)+j]);
if(j==N+1)
printf("\n");
}
return 0;
}
int print_vector(float *b,int N){
int i;
for(i=0;i<N;i+=1)
printf("%f\n", b[i]);
return 0;
}
I updated the code above with the omp_get_wtime(), and now it reads as the wtime diminishing as I include more and more threads, so, it does behave as it should, although not as clean as I would like.
For 1000 x 1000 matrices I get 0.25 s for the GSL lib, 4.4 s for the serial omp run and 1.5 s for the 4-thread run.
For 3000 x 3000 matrices, I get ~ 9s for the GSL lib, ~ 117 s for the serial omp run and ~ 44 s for the 4 thread-run, thus at least adding more threads indeed speeds up the program!
Thanks a lot everyone

What structure pcap_t have?

Where the definition of pcap_t? I just found typedef struct pcap pcap_t; in pcap.h but pcap havn't definition there and wincap manual have same problem without description of this or may be i didn't find right. If this on library then may be someone can tell possible structure?

This is one possible answer (here's the source):
struct pcap {
int fd;
int snapshot;
int linktype;
int tzoff; /* timezone offset */
int offset; /* offset for proper alignment */
struct pcap_sf sf;
struct pcap_md md;
/*
* Read buffer.
*/
int bufsize;
u_char *buffer;
u_char *bp;
int cc;
/*
* Place holder for pcap_next().
*/
u_char *pkt;
/*
* Placeholder for filter code if bpf not in kernel.
*/
struct bpf_program fcode;
char errbuf[PCAP_ERRBUF_SIZE];
};

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

how to deal with inline assembly - inline-assembly

why sum2 result is 6?? ,here is the code #if defined(aarch64) int tmp=0; int sum2=0; int a1=2; asm volatile ( "mov %0,3\n\t" //tmp=3 "add %1,%0,%2\n\t" //sum2=tmp+a1=3+2 ??? :"=r"(tmp),"=r"(sum2) :"r"(a1) ); LOG_D(TAG,"a=%d\n",sum2); #endif

Related

why scip making an extra branching decision after calling SCIPincludeBranchRrule?

Raku/Perl6: how do you code a NULL with NativeCall

CGAL: get the info of nearest neighbors

Gaussian Elimination in OpenMP - Performance Problems

What structure pcap_t have?

Categories

Resources

We Keep Coding

sql objective-c vba vb.net react-native apache vue.js tensorflow api pandas

how to deal with inline assembly - inline-assembly

why sum2 result is 6?? ,here is the code #if defined(__aarch64__) int tmp=0; int sum2=0; int a1=2; __asm__ __volatile__ ( "mov %0,3\n\t" //tmp=3 "add %1,%0,%2\n\t" //sum2=tmp+a1=3+2 ??? :"=r"(tmp),"=r"(sum2) :"r"(a1) ); LOG_D(TAG,"a=%d\n",sum2); #endif

Related

why scip making an extra branching decision after calling SCIPincludeBranchRrule?

Raku/Perl6: how do you code a NULL with NativeCall

CGAL: get the info of nearest neighbors

Gaussian Elimination in OpenMP - Performance Problems

What structure pcap_t have?

Categories

Resources

why sum2 result is 6?? ,here is the code #if defined(aarch64) int tmp=0; int sum2=0; int a1=2; asm volatile ( "mov %0,3\n\t" //tmp=3 "add %1,%0,%2\n\t" //sum2=tmp+a1=3+2 ??? :"=r"(tmp),"=r"(sum2) :"r"(a1) ); LOG_D(TAG,"a=%d\n",sum2); #endif