OpenCL Local memory and Xcode - xcode6

I'm trying to learn OpenCL on a Mac, which appears to have some differences in implementation from the OpenCL book I'm reading. I want to be able to dynamically allocate local memory on the GPU. What I'm reading is I need to use the clSetKernelArg function, but that doesn't work within Xcode 6.4. Here's the code as it stands (never mind it's a pointless program, just trying to learn the syntax for shared memory). In Xcode, the kernel is written as a stand-alone .cl file similar to CUDA, so that's a separate file.
add.cl:
kernel void add(int a, int b, global int* c, local int* d)
{
d[0] = a;
d[1] = b;
*c = d[0] + d[1];
}
main.c:
#include <stdio.h>
#include <OpenCL/opencl.h>
#include "add.cl.h"
int main(int argc, const char * argv[]) {
int a = 3;
int b = 5;
int c;
int* cptr = &c;
dispatch_queue_t queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_GPU, NULL);
void* dev_c = gcl_malloc(sizeof(cl_int), NULL, CL_MEM_WRITE_ONLY);
// attempt to create local memory buffer
void* dev_d = gcl_malloc(2*sizeof(cl_int), NULL, CL_MEM_READ_WRITE);
// clSetKernelArg(add_kernel, 3, 2*sizeof(cl_int), NULL);
dispatch_sync(queue, ^{
cl_ndrange range = { 1, {0, 0, 0}, {1, 0, 0}, {1, 0, 0} };
// This gives a warning:
// Warning: Incompatible pointer to integer conversion passing 'cl_int *'
// (aka 'int *') to parameter of type 'size_t' (aka 'unsigned long')
add_kernel(&range, a, b, (cl_int*)dev_c, (cl_int*)dev_d);
gcl_memcpy((void*)cptr, dev_c, sizeof(cl_int));
});
printf("%d + %d = %d\n", a, b, c);
gcl_free(dev_c);
dispatch_release(queue);
return 0;
}
I've tried putting clSetKernelArg where indicated and it doesn't like the first argument:
Error: Passing 'void (^)(const cl_ndrange *, cl_int, cl_int, cl_int *, size_t)' to parameter of incompatible type 'cl_kernel' (aka 'struct _cl_kernel *')
I've looked and looked but can't find any examples illustrating this point within the Xcode environment. Can you point me in the right direction?

Managed to solve this by ditching Apple's extensions and using standard OpenCL 1.2 calls. That means replacing gcl_malloc with clCreateBuffer, replacing dispatch_sync with clEnqueueNDRangeKernel, and most importantly, using clSetKernelArg with NULL in the last argument for local variables. Works like a charm.
Here's the new version:
char kernel_add[1024] =
"kernel void add(int a, int b, global int* c, local int* d) \
{\
d[0] = a;\
d[1] = b;\
*c = d[0] + d[1];\
}";
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <OpenCL/opencl.h>
int main(int argc, const char * argv[]) {
int a = 3;
int b = 5;
int c;
cl_device_id device_id;
int err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
cl_command_queue queue = clCreateCommandQueue(context, device_id, 0, &err);
const char* srccode = kernel;
cl_program program = clCreateProgramWithSource(context, 1, &srccode, NULL, &err);
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel(program, "kernel_add", &err);
cl_mem dev_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int), NULL, NULL);
err = clSetKernelArg(kernel, 0, sizeof(int), &a);
err |= clSetKernelArg(kernel, 1, sizeof(int), &b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dev_c);
err |= clSetKernelArg(kernel, 3, sizeof(int), NULL);
size_t one = 1;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &one, NULL, 0, NULL, NULL);
clFinish(queue);
err = clEnqueueReadBuffer(queue, dev_c, true, 0, sizeof(int), &c, 0, NULL, NULL);
clReleaseMemObject(dev_c);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
return 0;
}

In regular OpenCL, for a kernel parameter declared as a local pointer, you don't allocate a host buffer and pass it in (like you're doing with dev_d). Instead you do a clSetKernelArg with the size of the desired local storage but a NULL pointer (like this: clSetKernelArg(kernel, 2, sizeof(cl_int) * local_work_size[0], NULL)). You'll have to translate that into the Xcode way if you insist on being platform-specific.

Related

GNU Radio circular buffer manipulation

I encountered the following error
gr::log :WARN: tpb_thread_body - asynchronous message buffer overflowing, dropping message
Out of serendipity, I ran into this GNU Radio presentation on
Youtube.
The presenter mentioned an OOT block he called "buffer" that is capable of eliminating the "buffer overflowing" error. Apparently, this block plays with different sample rates and the so-called "circular buffers". I haven't worked with circular buffers myself. Any ideas on circular buffers or any hints on how to build this buffer block are welcome.
EDIT
Below is the flowgraph that generates the error. As it was suggested in the comments, the culprits could be the message processing blocks (red-circled) namely generateCADU (for generating standard CCSDS frames) and processCADU (for extracting CADUs from a data stream).
The implementation file of the generateCADU block is given below
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <gnuradio/io_signature.h>
#include "generateCADU_impl.h"
#include "fec/ReedSolomon/ReedSolomon.h"
#include "fec/Scrambler/Scrambler.h"
namespace gr {
namespace ccsds {
generateCADU::sptr
generateCADU::make(int frameLength,std::string sync, int scramble, int rs, int intDepth)
{
return gnuradio::get_initial_sptr
(new generateCADU_impl(frameLength, sync, scramble, rs, intDepth));
}
/*
* The private constructor
*/
generateCADU_impl::generateCADU_impl(int frameLength,std::string sync, int scramble, int rs, int intDepth)
: gr::sync_block("generateCADU",
gr::io_signature::make(1, 1, sizeof(unsigned char)),
gr::io_signature::make(0, 0, 0)),
d_frameLength(frameLength),d_scramble(scramble == 1),d_rs(rs >= 1), d_basis(rs >= 2), d_intDepth(intDepth)
{
set_output_multiple(d_frameLength);
//Registering output port
message_port_register_out(pmt::mp("out"));
d_sync = parse_string(sync);
}
/*
* Our virtual destructor.
*/
generateCADU_impl::~generateCADU_impl()
{
}
unsigned char
generateCADU_impl::parse_hex(char c)
{
if ('0' <= c && c <= '9') return c - '0';
if ('A' <= c && c <= 'F') return c - 'A' + 10;
if ('a' <= c && c <= 'f') return c - 'a' + 10;
std::abort();
}
std::vector<unsigned char>
generateCADU_impl::parse_string(const std::string & s)
{
if (s.size() % 2 != 0) std::abort();
std::vector<unsigned char> result(s.size() / 2);
for (std::size_t i = 0; i != s.size() / 2; ++i)
result[i] = 16 * parse_hex(s[2 * i]) + parse_hex(s[2 * i + 1]);
return result;
}
int
generateCADU_impl::work(int noutput_items,
gr_vector_const_void_star &input_items,
gr_vector_void_star &output_items)
{
const unsigned char *in = (const unsigned char *) input_items[0];
//Reed-Solomon and Scrambler objects
ReedSolomon RS(16,d_intDepth,d_basis);// False = conventional, True = dual-basis
Scrambler S;
//Buffers
unsigned char *frameBuffer1 = (unsigned char*)malloc(d_frameLength*sizeof(unsigned char));
std::vector<unsigned char> frameBuffer2;
//The work function engine
for(int i = 0; (i + d_frameLength) < noutput_items; i += d_frameLength)
{
//Copying data from input stream
memcpy(frameBuffer1,in + i + d_frameLength,d_frameLength);
//Copying frame into std::vector buffer
frameBuffer2.insert(frameBuffer2.begin(),frameBuffer1, frameBuffer1 + d_frameLength);
//Optional scrambling and Reed-Solomon
if (d_rs) RS.Encode_RS(frameBuffer2);
if (d_scramble) S.Scramble(frameBuffer2);
//Insert sync word
frameBuffer2.insert(frameBuffer2.begin(), d_sync.begin(), d_sync.end());
//Transmitting PDU
pmt::pmt_t pdu(pmt::cons(pmt::PMT_NIL,pmt::make_blob(frameBuffer2.data(),frameBuffer2.size())));
message_port_pub(pmt::mp("out"), pdu);
//Clear buffer
frameBuffer2.clear();
}
// Tell runtime system how many output items we produced.
return noutput_items;
}
} /* namespace ccsds */
} /* namespace gr */
And here is the processCADU block. This block uses tags generated by the synchronizeCADU (which is simply a wrapper for the correlate_access_tag block) to extract CADUs
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <gnuradio/io_signature.h>
#include "processCADU_impl.h"
#include "fec/ReedSolomon/ReedSolomon.h"
#include "fec/Scrambler/Scrambler.h"
namespace gr {
namespace ccsds {
processCADU::sptr
processCADU::make(int frameLength, int scramble, int rs, int intDepth, std::string tagName)
{
return gnuradio::get_initial_sptr
(new processCADU_impl(frameLength, scramble, rs, intDepth, tagName));
}
/*
* The private constructor
*/
processCADU_impl::processCADU_impl(int frameLength, int scramble, int rs, int intDepth, std::string tagName)
: gr::sync_block("processCADU",
gr::io_signature::make(1, 1, sizeof(unsigned char)),
gr::io_signature::make(0, 0, 0)),
d_frameLength(frameLength),d_scramble(scramble == 1),d_rs(rs >= 1), d_basis(rs >= 2), d_intDepth(intDepth)
{
//Multiple input
set_output_multiple(d_frameLength * 8);
//Registering output port
message_port_register_out(pmt::mp("out"));
if (d_rs) d_frameLength += 32 * d_intDepth;
//SEtting tag name
key = pmt::mp(tagName);
}
/*
* Our virtual destructor.
*/
processCADU_impl::~processCADU_impl()
{
delete d_pack;
}
int
processCADU_impl::work(int noutput_items,
gr_vector_const_void_star &input_items,
gr_vector_void_star &output_items)
{
const unsigned char *in = (const unsigned char *) input_items[0];
unsigned char *out = (unsigned char *) output_items[0];
void *msg_data = NULL;
unsigned char frame_data[d_frameLength];
unsigned char frame_len = 0;
std::vector<unsigned char> frameBuffer;
//Reed-Solomon and Scrambler objects
ReedSolomon RS(16,d_intDepth,d_basis);// False = conventional, True = dual-basis
std::vector<int> errors;//errors.push_back(0);
Scrambler S;
d_tags.clear();
d_pack = new blocks::kernel::pack_k_bits(8);
this->get_tags_in_window(d_tags, 0, 0, noutput_items,key);
for(d_tags_itr = d_tags.begin(); d_tags_itr != d_tags.end(); d_tags_itr++) {
// Check that we have enough data for a full frame
if ((d_tags_itr->offset - this->nitems_read(0)) > (noutput_items - (d_frameLength) * 8))
{
return (d_tags_itr->offset - this->nitems_read(0) - 1);
}
//Pack bits into bytes
d_pack->pack(frame_data, &in[d_tags_itr->offset - this->nitems_read(0)], d_frameLength);
//Copying frame into std::vector buffer
frameBuffer.insert(frameBuffer.begin(),frame_data, frame_data + d_frameLength);
//Optional scrambling and Reed-Solomon
if (d_scramble) S.Scramble(frameBuffer);
//if (d_rs) RS.Decode_RS(frameBuffer,errors);
//If there is Reed-Solomon decoding
if(d_rs)
{
RS.Decode_RS(frameBuffer,errors);
if (RS.Success(errors)) // Success
{
//std::cout << "Success" << std::endl;
pmt::pmt_t pdu(pmt::cons(pmt::PMT_NIL,pmt::make_blob(frameBuffer.data(),frameBuffer.size())));
message_port_pub(pmt::mp("out"), pdu);
/*for(int i=0; i < errors.size(); i++)
{
//std::cout << "Number of Errors : " << errors.at(i) << std::endl << std::endl;
}*/
}
else // Failure
{
std::cout << "RS failure" << std::endl;
}
}
else{
pmt::pmt_t pdu(pmt::cons(pmt::PMT_NIL,pmt::make_blob(frameBuffer.data(),frameBuffer.size())));
message_port_pub(pmt::mp("out"), pdu);
}
//Clear buffers
frameBuffer.clear();
errors.clear();
}
// Tell runtime system how many output items we produced.
return noutput_items;
}
} /* namespace ccsds */
} /* namespace gr */
Regards,
M
Thanks to #MarcusMüller suggestion, using the tagged_stream paradigma as opposed to PDUs solved the problem. I was able to transmit 47 terabytes of data without any problems. Below is the code for the newly implemented block.
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <gnuradio/io_signature.h>
#include "genCADU_impl.h"
namespace gr {
namespace ccsds {
genCADU::sptr
genCADU::make(int frameLength,std::string sync, int scramble, int rs, int intDepth, std::string len_tag_key)
{
return gnuradio::get_initial_sptr
(new genCADU_impl(frameLength, sync, scramble, rs, intDepth, len_tag_key));
}
/*
* The private constructor
*/
genCADU_impl::genCADU_impl(int frameLength,std::string sync, int scramble, int rs, int intDepth, std::string len_tag_key)
: gr::tagged_stream_block("genCADU",
gr::io_signature::make(1, 1, sizeof(unsigned char)),
gr::io_signature::make(1, 1, sizeof(unsigned char)),len_tag_key),
d_frameLength(frameLength),d_scramble(scramble == 1),d_rs(rs >= 1), d_basis(rs >= 2), d_intDepth(intDepth)
{
//Synchronization pattern
d_sync = parse_string(sync);
//Reed-Solomon and Scrambler objects
RS = new ReedSolomon(16,d_intDepth,d_basis);// False = conventional, True = dual-basis
S = new Scrambler();
}
/*
* Our virtual destructor.
*/
genCADU_impl::~genCADU_impl()
{
delete RS;
delete S;
}
int
genCADU_impl::calculate_output_stream_length(const gr_vector_int &ninput_items)
{
int noutput_items = (d_rs) ? d_frameLength + 32*d_intDepth + d_sync.size() : d_frameLength + d_sync.size();
return noutput_items ;
}
unsigned char
genCADU_impl::parse_hex(char c)
{
if ('0' <= c && c <= '9') return c - '0';
if ('A' <= c && c <= 'F') return c - 'A' + 10;
if ('a' <= c && c <= 'f') return c - 'a' + 10;
std::abort();
}
std::vector<unsigned char>
genCADU_impl::parse_string(const std::string & s)
{
if (s.size() % 2 != 0) std::abort();
std::vector<unsigned char> result(s.size() / 2);
for (std::size_t i = 0; i != s.size() / 2; ++i)
result[i] = 16 * parse_hex(s[2 * i]) + parse_hex(s[2 * i + 1]);
return result;
}
int
genCADU_impl::work (int noutput_items,
gr_vector_int &ninput_items,
gr_vector_const_void_star &input_items,
gr_vector_void_star &output_items)
{
const unsigned char *in = (const unsigned char *) input_items[0];
unsigned char *out = (unsigned char *) output_items[0];
int total_len;
//Copy pdu from circular buffer to local buffer
buffer.insert(buffer.end(), in, in + d_frameLength);
//Optional scrambling and Reed-Solomon. TO DO: Turbo and LDPC
if (d_rs) RS->Encode_RS(buffer);
if (d_scramble) S->Scramble(buffer);
//Insert sync word
buffer.insert(buffer.begin(), d_sync.begin(), d_sync.end());
//Copy from local buffer to circular buffer
std::copy(buffer.begin(),buffer.end(),out);
//Clear the local buffer
total_len = buffer.size();
buffer.clear();
// Tell runtime system how many output items we produced.
return total_len;
}
} /* namespace ccsds */
} /* namespace gr */
Regards,
M.

How to get parent process ID using objective c in OS X?

I have PID of some process and need to get parent process id. How to get it using objective c?
Original source: http://www.objectpark.net/parentpid.html
#include <sys/sysctl.h>
#define OPProcessValueUnknown UINT_MAX
int ProcessIDForParentOfProcessID(int pid)
{
struct kinfo_proc info;
size_t length = sizeof(struct kinfo_proc);
int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, pid };
if (sysctl(mib, 4, &info, &length, NULL, 0) < 0)
return OPProcessValueUnknown;
if (length == 0)
return OPProcessValueUnknown;
return info.kp_eproc.e_ppid;
}

Creating a RAW UDP connection in lwip ARP

I am currently working to create a simple transfer protocol over Ethernet. I have a SP605 Xilinx evaluation board which I am using to debug the Ethernet portion of our project. I attempted to cannibalize the example but have so far been unsuccessful. Currently, the communication needs to only be one way. Currently, I am trying to see the data being sent with netcat. I also have wireshark open and am seeing the system get stuck repeatedly asking:
2217 1323.697811000 Xilinx_00:01:02 Broadcast
ARP 60 Who has 192.168.1.11? Tell 192.168.1.10
I can see the Host computer reply with:
2217 1323.697811000 Xilinx_00:01:02 Broadcast
ARP 60 Who has 192.168.1.11? Tell 192.168.1.10
I feel like I have some issues with the configuration but cannot figure out how what it is. I think it might have something to do with a not having a recv handler set but I am not sure.
Below is the code I am using. lwip_init() is mimicking the call from the examples provided by Xilinx.
/*
* main.c
*
* Created on: Sep 24, 2013
* Author: Ian
*/
#include <stdio.h>
#include <string.h>
#include <stdio.h>
#include "lwip/init.h"
#include "xparameters.h"
#include "netif/xadapter.h"
#include "xenv_standalone.h"
#include "platform_config.h"
#include "xparameters.h"
#include "xintc.h"
#include "xil_exception.h"
#include "mb_interface.h"
#include "xtmrctr_l.h"
#include "lwip/udp.h"
#include "lwipopts.h"
#include "xil_printf.h"
struct ip_addr ipaddr, ipaddr_remote, netmask, gw;
void udp_test(void *arg, struct udp_pcb *pcb, struct pbuf *p, struct ip_addr *addr, u16_t port);
void print_ip(char *msg, struct ip_addr *ip)
{
print(msg);
xil_printf("%d.%d.%d.%d\r\n", ip4_addr1(ip), ip4_addr2(ip),
ip4_addr3(ip), ip4_addr4(ip));
}
void print_ip_settings(struct ip_addr *ip, struct ip_addr *mask, struct ip_addr *gw)
{
print_ip("Board IP: ", ip);
print_ip("Netmask : ", mask);
print_ip("Gateway : ", gw);
}
int main()
{
err_t error;
struct netif *netif, server_netif;
struct udp_pcb *udp_1;
struct pbuf *p;
char data[8] = "01234567";
u16_t Port;
Port = 69;
int count = 0;
int n = 0;
int buflen = 8;
/* the mac address of the board. this should be unique per board */
unsigned char mac_ethernet_address[] = { 0x00, 0x0a, 0x35, 0x00, 0x01, 0x02 };
netif = &server_netif;
xil_printf("\r\n\r\n");
xil_printf("-----lwIP RAW Application ------\r\n");
/* initliaze IP addresses to be used */
IP4_ADDR(&ipaddr_remote, 192, 168, 1, 11);
IP4_ADDR(&ipaddr, 192, 168, 1, 10);
IP4_ADDR(&netmask, 255, 255, 255, 0);
IP4_ADDR(&gw, 192, 168, 1, 1);
print_ip_settings(&ipaddr, &netmask, &gw);
lwip_init();
if (!xemac_add(netif, &ipaddr, &netmask, &gw, mac_ethernet_address, PLATFORM_EMAC_BASEADDR)) {
xil_printf("Error adding N/W interface\r\n");
return -1;
}
netif_set_default(netif);
netif_set_up(netif);
Xil_ExceptionEnable(); //Setup complete start interrupts
udp_1 = udp_new();
error = udp_bind(udp_1, IP_ADDR_ANY, Port);
if (error != 0)
{
xil_printf("Failed %d\r\n", error);
}
else if (error == 0)
{
xil_printf("Success\r\n");
}
error = udp_connect(udp_1, &ipaddr_remote, Port);
if (error != 0)
{
xil_printf("Failed %d\r\n", error);
}
else if (error == 0)
{
xil_printf("Success\r\n");
}
while(1)
{
count++;
xemacif_input(netif);
if (count == 100000)
{
p = pbuf_alloc(PBUF_TRANSPORT, buflen, PBUF_POOL);
if (!p) {
xil_printf("error allocating pbuf\r\n");
return ERR_MEM;
}
memcpy(p->payload, data, buflen);
udp_send(udp_1, p);
xil_printf("SEND\r\n");
count = 0;
pbuf_free(p);
}
}
data[1] = '2';
}
Ok, so basically here is what I found.
The Xilinx xapp1026 had issues with the sp605_AxiEth_32kb_Cache project when I used it. It was hanging at the start interrupts. I was not able to diagnose the project BUT I switched to the sp605_EthernetLite_32kb_Cache example project. I can only assume that the failure of the MicroBlaze interrupts to initialize caused the ARP to fail to get added and forced the system into the loop repeatedly. It is still unclear why the interrupt failed to initialize in the AxiEth example.
Once here I was able to get a program to work by stripping down the provided system and using the following code:
/*
* Copyright (c) 2007 Xilinx, Inc. All rights reserved.
*
* Xilinx, Inc.
* XILINX IS PROVIDING THIS DESIGN, CODE, OR INFORMATION "AS IS" AS A
* COURTESY TO YOU. BY PROVIDING THIS DESIGN, CODE, OR INFORMATION AS
* ONE POSSIBLE IMPLEMENTATION OF THIS FEATURE, APPLICATION OR
* STANDARD, XILINX IS MAKING NO REPRESENTATION THAT THIS IMPLEMENTATION
* IS FREE FROM ANY CLAIMS OF INFRINGEMENT, AND YOU ARE RESPONSIBLE
* FOR OBTAINING ANY RIGHTS YOU MAY REQUIRE FOR YOUR IMPLEMENTATION.
* XILINX EXPRESSLY DISCLAIMS ANY WARRANTY WHATSOEVER WITH RESPECT TO
* THE ADEQUACY OF THE IMPLEMENTATION, INCLUDING BUT NOT LIMITED TO
* ANY WARRANTIES OR REPRESENTATIONS THAT THIS IMPLEMENTATION IS FREE
* FROM CLAIMS OF INFRINGEMENT, IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS FOR A PARTICULAR PURPOSE.
*
*/
#include <stdio.h>
#include <string.h>
#include "lwip/udp.h"
#include "xparameters.h"
#include "netif/xadapter.h"
#include "platform.h"
#include "platform_config.h"
#include "lwipopts.h"
#ifndef __PPC__
#include "xil_printf.h"
#endif
void print_headers();
int start_applications();
int transfer_data();
void platform_enable_interrupts();
void lwip_init(void);
void tcp_fasttmr(void);
void tcp_slowtmr(void);
#if LWIP_DHCP==1
extern volatile int dhcp_timoutcntr;
err_t dhcp_start(struct netif *netif);
#endif
extern volatile int TxPerfConnMonCntr;
extern volatile int TcpFastTmrFlag;
extern volatile int TcpSlowTmrFlag;
void print_ip(char *msg, struct ip_addr *ip)
{
print(msg);
xil_printf("%d.%d.%d.%d\r\n", ip4_addr1(ip), ip4_addr2(ip),
ip4_addr3(ip), ip4_addr4(ip));
}
void print_ip_settings(struct ip_addr *ip, struct ip_addr *mask, struct ip_addr *gw)
{
print_ip("Board IP: ", ip);
print_ip("Netmask : ", mask);
print_ip("Gateway : ", gw);
}
int main()
{
struct netif *netif, server_netif;
struct ip_addr ipaddr, netmask, gw;
// Added stuff for the creation of a basic UDP
err_t error;
struct ip_addr ip_remote;
struct udp_pcb *udp_1;
struct pbuf *p;
char data[8] = "01234567";
u16_t Port = 12;
int buflen = 8;
int count = 0;
/* the mac address of the board. this should be unique per board */
unsigned char mac_ethernet_address[] = { 0x00, 0x0a, 0x35, 0x00, 0x01, 0x02 };
netif = &server_netif;
if (init_platform() < 0) {
xil_printf("ERROR initializing platform.\r\n");
return -1;
}
xil_printf("\r\n\r\n");
xil_printf("-----lwIP RAW Mode Demo Application ------\r\n");
/* initliaze IP addresses to be used */
#if (LWIP_DHCP==0)
IP4_ADDR(&ipaddr, 192, 168, 1, 10);
IP4_ADDR(&netmask, 255, 255, 255, 0);
IP4_ADDR(&gw, 192, 168, 1, 1);
print_ip_settings(&ipaddr, &netmask, &gw);
#endif
lwip_init();
#if (LWIP_DHCP==1)
ipaddr.addr = 0;
gw.addr = 0;
netmask.addr = 0;
#endif
/* Add network interface to the netif_list, and set it as default */
if (!xemac_add(netif, &ipaddr, &netmask, &gw, mac_ethernet_address, PLATFORM_EMAC_BASEADDR)) {
xil_printf("Error adding N/W interface\r\n");
return -1;
}
netif_set_default(netif);
/* specify that the network if is up */
netif_set_up(netif);
/* now enable interrupts */
platform_enable_interrupts();
#if (LWIP_DHCP==1)
/* Create a new DHCP client for this interface.
* Note: you must call dhcp_fine_tmr() and dhcp_coarse_tmr() at
* the predefined regular intervals after starting the client.
*/
dhcp_start(netif);
dhcp_timoutcntr = 24;
TxPerfConnMonCntr = 0;
while(((netif->ip_addr.addr) == 0) && (dhcp_timoutcntr > 0)) {
xemacif_input(netif);
if (TcpFastTmrFlag) {
tcp_fasttmr();
TcpFastTmrFlag = 0;
}
if (TcpSlowTmrFlag) {
tcp_slowtmr();
TcpSlowTmrFlag = 0;
}
}
if (dhcp_timoutcntr <= 0) {
if ((netif->ip_addr.addr) == 0) {
xil_printf("DHCP Timeout\r\n");
xil_printf("Configuring default IP of 192.168.1.10\r\n");
IP4_ADDR(&(netif->ip_addr), 192, 168, 1, 10);
IP4_ADDR(&(netif->netmask), 255, 255, 255, 0);
IP4_ADDR(&(netif->gw), 192, 168, 1, 1);
}
}
/* receive and process packets */
print_ip_settings(&(netif->ip_addr), &(netif->netmask), &(netif->gw));
#endif
/* start the application (web server, rxtest, txtest, etc..) */
xil_printf("Setup Done");
IP4_ADDR(&ip_remote, 192, 168, 1, 11);
udp_1 = udp_new();
error = udp_bind(udp_1, IP_ADDR_ANY, Port);
if (error != 0)
{
xil_printf("Failed %d\r\n", error);
}
else if (error == 0)
{
xil_printf("Success\r\n");
}
error = udp_connect(udp_1, &ip_remote, Port);
if (error != 0)
{
xil_printf("Failed %d\r\n", error);
}
else if (error == 0)
{
xil_printf("Success\r\n");
}
while (1)
{
xemacif_input(netif);
count++;
if (count == 80000)
{
p = pbuf_alloc(PBUF_TRANSPORT, buflen, PBUF_POOL);
if (!p) {
xil_printf("error allocating pbuf\r\n");
return ERR_MEM;
}
memcpy(p->payload, data, buflen);
udp_send(udp_1, p);
xil_printf("SEND\r\n");
count = 0;
pbuf_free(p);
}
}
/* never reached */
cleanup_platform();
return 0;
}
----Edit ----
So you know how people figure it out then don't leave an answer. Well here was my problem with the orginal code (I think..) the line of code xemacif_input(netif); gives the Ethernet the ability to process the arp call without it the FPGA will sending out the ARP and then not receiving it will ask repeatedly.
The previous code does appear to have the correct line of code in it. So it might have been a mistake in how the interrupts were configured.
I got this example working and implemented it in my project. If you have questions about this please ask and I will try and give the best answers I can.

OpenCL - Kernel crashes on the second run

I am trying to run a code which works the first time but crashes the second time that it runs. The function which causes the crash is part of the class Octree_GPU and is this:
int Octree_GPU::runCreateNodeKernel(int length)
{
cl_uint nodeLength;
if(nodeNumsArray[length-1] == 0)
nodeLength = nodeAddArray[length-1];
else
nodeLength = nodeAddArray[length-1]+8;
nodeArray = (cl_uint*)malloc(sizeof(cl_uint)*nodeLength);
nodePointsArray = (cl_int*)malloc(sizeof(cl_uint)*nodeLength);
startIndexArray = (cl_int*)malloc(sizeof(cl_int)*nodeLength);
d_nodeAdd = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*length, NULL, &err);
d_nodeArray = clCreateBuffer(context,CL_MEM_READ_WRITE, sizeof(cl_uint)*temp_length, NULL, &err);
d_numPoints = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*length, NULL, &err);
d_pointIndex = clCreateBuffer(context, CL_MEM_READ_WRITE,sizeof(cl_uint)*length,NULL, &err);
d_nodePointsArray = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int)*temp_length, NULL, &err);
d_nodeIndexArray = clCreateBuffer(context,CL_MEM_READ_WRITE, sizeof(cl_int)*temp_length, NULL, &err);
err |= clEnqueueWriteBuffer(commands, d_nodeAdd, CL_TRUE, 0, sizeof(cl_uint)*length, nodeAddArray, 0, NULL,NULL);
err |= clEnqueueWriteBuffer(commands, d_numPoints,CL_TRUE, 0, sizeof(cl_uint)*length,numPointsArray,0,NULL,NULL);
err |= clEnqueueWriteBuffer(commands, d_pointIndex, CL_TRUE, 0, sizeof(cl_uint)*length,pointStartIndexArray,0, NULL, NULL);
clFinish(commands);
err = clSetKernelArg(createNodeKernel, 0, sizeof(cl_mem), &d_odata);
err |= clSetKernelArg(createNodeKernel, 1, sizeof(cl_mem), &d_nodeNums);
err |= clSetKernelArg(createNodeKernel, 2, sizeof(cl_mem), &d_nodeAdd);
err |= clSetKernelArg(createNodeKernel, 3, sizeof(cl_mem), &d_numPoints);
err |= clSetKernelArg(createNodeKernel, 4, sizeof(cl_mem), &d_pointIndex);
err |= clSetKernelArg(createNodeKernel, 5, sizeof(cl_mem), &d_nodeArray);
err |= clSetKernelArg(createNodeKernel, 6, sizeof(cl_mem), &d_nodePointsArray);
err |= clSetKernelArg(createNodeKernel, 7, sizeof(cl_mem), &d_nodeIndexArray);
clFinish(commands);
if(err != CL_SUCCESS) {
printf("Cannot set Kernel Arg \n");
exit(1);
}
size_t global_size[1] = {limit-1};
err = clEnqueueNDRangeKernel(commands, createNodeKernel, 1, NULL, global_size, NULL, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf(" Kernel does not work \n");
exit(1);
}
clFinish(commands);
err = clEnqueueReadBuffer(commands, d_nodeArray, CL_TRUE, 0, sizeof(cl_uint)*temp_length, nodeArray, 0, NULL, NULL);
err|= clEnqueueReadBuffer(commands, d_nodePointsArray, CL_TRUE, 0, sizeof(cl_int)*nodeLength, nodePointsArray, 0, NULL, NULL);
err|= clEnqueueReadBuffer(commands, d_nodeIndexArray, CL_TRUE, 0, sizeof(cl_int)*nodeLength, startIndexArray, 0, NULL, NULL);
clFinish(commands);
clReleaseMemObject(d_nodeAdd);
clReleaseMemObject(d_numPoints);
clReleaseMemObject(d_nodeArray);
clReleaseMemObject(d_nodePointsArray);
clFinish(commands);
return 0;
}
Please note that d_odata and d_nodeNums have been declared in the previous functions. The kernel code is given below for the same:
__kernel void createNode(__global int* uniqueCode, __global int* nodeNums,__global int* nodeAdd, __global int* numPoints, __global int* pointIndex,__global int* nodeArray, __global int* nodePoints,__global int* nodeIndex)
{
int ig = get_global_id(0);
int add;
int num = uniqueCode[ig];
int pt = numPoints[ig];
int ind = pointIndex[ig];
int temp,j;
if(nodeNums[ig] == 8)
{
for(int i=0;i<8;i++)
{
temp = ((int)num/10)*10+i;
add = nodeAdd[ig] + i;
nodeArray[add] = temp;
nodePoints[add] = select(0, pt, temp==num);
nodeIndex[add] = select(-1, ind, temp==num);
barrier(CLK_LOCAL_MEM_FENCE);
}
}
else
{
j = num % 10;
nodeAdd[ig] = nodeAdd[ig-1];
add = nodeAdd[ig]+j;
nodePoints[add] = pt;
nodeIndex[add] = ind;
barrier(CLK_LOCAL_MEM_FENCE);
}
}
I have tried to find out why but have not succeeded. I might be overlooking something really simple. Thank you for your help.
I'm not 100% sure this is causing the crash, but where you've written
if(nodeNums[ig] == 8)
{
for(int i=0;i<8;i++)
{
barrier(CLK_LOCAL_MEM_FENCE);
}
}
else
{
barrier(CLK_LOCAL_MEM_FENCE);
}
This means that different threads in a work group will be executing different numbers of barriers, which may cause a hang/crash. A barrier (with CLK_LOCAL_MEM_FENCE) is for synchronising accesses to local memory, so all work items in a group must execute this before continuing
On a non crash note, it looks like you're using CLK_LOCAL_MEM_FENCE (ensure that local memory accesses are visible across threads) when you mean CLK_GLOBAL_MEM_FENCE (ensure that global memory accesses are visible across threads)
Also
nodeAdd[ig] = nodeAdd[ig-1];
Is not correct for ig == 0. This may not be causing the actual crash (because I've found that OpenCL can be unfortunately quite forgiving), but its worth fixing

CUDA Thrust sort_by_key when the key is a tuple dealt with by zip_iterator's with custom comparison predicate

I've looked through a lot of questions here for something similar and there are quite a few, albeit with one minor change. I'm trying to sort values with a zip_iterator as a compound key.
Specifically, I have the following function:
void thrustSort(
unsigned int * primaryKey,
float * secondaryKey,
unsigned int * values,
unsigned int numberOfPoints)
{
thrust::device_ptr dev_ptr_pkey = thrust::device_pointer_cast(primaryKey);
thrust::device_ptr dev_ptr_skey = thrust::device_pointer_cast(secondaryKey);
thrust::device_ptr dev_ptr_values = thrust::device_pointer_cast(values);
thrust::tuple,thrust::device_ptr> keytup_begin =
thrust::make_tuple,thrust::device_ptr>(dev_ptr_pkey, dev_ptr_skey);
thrust::zip_iterator, thrust::device_ptr > > first =
thrust::make_zip_iterator, thrust::device_ptr > >(keytup_begin);
thrust::sort_by_key(first, first + numberOfPoints, dev_ptr_values, ZipComparator());
}
and this custom predicate:
typedef thrust::device_ptr<unsigned int> tdp_uint ;
typedef thrust::device_ptr<float> tdp_float ;
typedef thrust::tuple<tdp_uint, tdp_float> tdp_uif_tuple ;
struct ZipComparator
{
__host__ __device__
inline bool operator() (const tdp_uif_tuple &a, const tdp_uif_tuple &b)
{
if(a.head < b.head) return true;
if(a.head == b.head) return a.tail < b.tail;
return false;
}
};
The errors I'm getting are:
Error 1 error : no instance of constructor "thrust::device_ptr::device_ptr [with T=unsigned int]" matches the argument list C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.0\include\thrust\detail\tuple.inl 309 1 ---
Error 2 error : no instance of constructor "thrust::device_ptr::device_ptr [with T=float]" matches the argument list C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.0\include\thrust\detail\tuple.inl 401 1 ---
Any ideas what might cause this / how do I write a predicate that indeed works?
Thanks in Advance,
Nathan
The comparator takes arguments of type const thrust::tuple<unsigned int, float>&. The const tdp_uif_tuple& type you defined expands to const thrust::tuple<thrust::device_ptr<unsigned int>, thrust:device_ptr<float> >&
The code below compiles for me:
struct ZipComparator
{
__host__ __device__
inline bool operator() (const thrust::tuple<unsigned int, float> &a, const thrust::tuple<unsigned int, float> &b)
{
if(a.head < b.head) return true;
if(a.head == b.head) return a.tail < b.tail;
return false;
}
};
Hope it does for you as well :)
http://code.google.com/p/thrust/wiki/QuickStartGuide#zip_iterator has more details on the zip iterator.
Not required, but if you're looking to clean up the length of those templates, you can do this:
void thrustSort(
unsigned int * primaryKey,
float * secondaryKey,
unsigned int * values,
unsigned int numberOfPoints)
{
tdp_uint dev_ptr_pkey(primaryKey);
tdp_float dev_ptr_skey(secondaryKey);
tdp_uint dev_ptr_values(values);
thrust::tuple<tdp_uint, tdp_float> keytup_begin = thrust::make_tuple(dev_ptr_pkey, dev_ptr_skey);
thrust::zip_iterator<thrust::tuple<tdp_uint, tdp_float> > first =
thrust::make_zip_iterator(keytup_begin);
thrust::sort_by_key(first, first + numberOfPoints, dev_ptr_values, ZipComparator());
}
A lot of the template arguments can be inferred from the arguments.
This is a fully worked example on how using sort_by_key when the key is a tuple dealt with by zip_iterator's and a customized comparison operator.
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include "Utilities.cuh"
// --- Defining tuple type
typedef thrust::tuple<int, int> Tuple;
/**************************/
/* TUPLE ORDERING FUNCTOR */
/**************************/
struct TupleComp
{
__host__ __device__ bool operator()(const Tuple& t1, const Tuple& t2)
{
if (t1.get<0>() < t2.get<0>())
return true;
if (t1.get<0>() > t2.get<0>())
return false;
return t1.get<1>() < t2.get<1>();
}
};
/********/
/* MAIN */
/********/
int main()
{
const int N = 8;
// --- Keys and values on the host: allocation and definition
int h_keys1[N] = { 1, 3, 3, 3, 2, 3, 2, 1 };
int h_keys2[N] = { 1, 5, 3, 8, 2, 8, 1, 1 };
float h_values[N] = { 0.3, 5.1, 3.2, -0.08, 2.1, 5.2, 1.1, 0.01};
printf("\n\n");
printf("Original\n");
for (int i = 0; i < N; i++) {
printf("%i %i %f\n", h_keys1[i], h_keys2[i], h_values[i]);
}
// --- Keys and values on the device: allocation
int *d_keys1; gpuErrchk(cudaMalloc(&d_keys1, N * sizeof(int)));
int *d_keys2; gpuErrchk(cudaMalloc(&d_keys2, N * sizeof(int)));
float *d_values; gpuErrchk(cudaMalloc(&d_values, N * sizeof(float)));
// --- Keys and values: host -> device
gpuErrchk(cudaMemcpy(d_keys1, h_keys1, N * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_keys2, h_keys2, N * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_values, h_values, N * sizeof(float), cudaMemcpyHostToDevice));
// --- From raw pointers to device_ptr
thrust::device_ptr<int> dev_ptr_keys1 = thrust::device_pointer_cast(d_keys1);
thrust::device_ptr<int> dev_ptr_keys2 = thrust::device_pointer_cast(d_keys2);
thrust::device_ptr<float> dev_ptr_values = thrust::device_pointer_cast(d_values);
// --- Declare outputs
thrust::device_vector<float> d_values_output(N);
thrust::device_vector<Tuple> d_keys_output(N);
auto begin_keys = thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_keys1, dev_ptr_keys2));
auto end_keys = thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_keys1 + N, dev_ptr_keys2 + N));
thrust::sort_by_key(begin_keys, end_keys, dev_ptr_values, TupleComp());
int *h_keys1_output = (int *)malloc(N * sizeof(int));
int *h_keys2_output = (int *)malloc(N * sizeof(int));
float *h_values_output = (float *)malloc(N * sizeof(float));
gpuErrchk(cudaMemcpy(h_keys1_output, d_keys1, N * sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_keys2_output, d_keys2, N * sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_values_output, d_values, N * sizeof(float), cudaMemcpyDeviceToHost));
printf("\n\n");
printf("Ordered\n");
for (int i = 0; i < N; i++) {
printf("%i %i %f\n", h_keys1_output[i], h_keys2_output[i], h_values_output[i]);
}
}