How to lower CPU usage of finish_task_switch(), called by epoll_wait? - redis

I've written a simple epoll-driven server to bench network/io performance. The server simply receive a request and send a response immediately. It is slower than redis-server 'get', 38k/s vs 40k/s. Both use redis-benchmark as the load runner, and both used cpu up (>99%).
bench redis-server: redis-benchmark -n 1000000 -c 20 -t get -p 6379
bench myserver : redis-benchmark -n 1000000 -c 20 -t get -p 6399
I've profiled them using linux perf, eliminated epoll_ctl in myserver (as what redis-server does). Now the problems is function finish_task_switch() takes too much cpu time, about 10%-15% (for redis-server and redis-benchmark are 3%, on the same machine).
The call flow (read it top-down) is -> epoll_wait(25%) -> entry_SYSCALL_64_after_hwframe(23.56%) -> do_syscall_64(23.23%) -> sys_epoll_wait(22.36%) -> ep_poll(21.88%) -> schedule_hrtimeout_range(12.98%) -> schedule_hrtimeout_range_clock(12.74%) -> schedule(11.30%) -> _schedule(11.30%) -> finish_task_switch(10.82%)
I've tried writing the server using raw epoll api, and using redis's api in redis/src/ae.c, nothing changed.
I've examined how redis-server and redis-benchmark use epoll, no tricks found.
The redis CFLAGS is used for myserver, same as redis-benchmark.
The CPU usage has nothing to do with level/edge-triggered, block or nonblock client fd, whether epoll_wait's timeout set or not.
#include <sys/epoll.h>
#include <sys/socket.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h> // exit
#include <string.h> // memset
#include "anet.h"
#define MAX_EVENTS 32
typedef struct {
int fd;
char querybuf[256];
} client;
client *clients;
char err[256];
#define RESPONSE_REDIS "$128\r\nxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\r\n"
static int do_use_fd(client *c)
{
int n = read(c->fd, c->querybuf, sizeof(c->querybuf));
if (n == 0) { printf("Client Closed\n"); return n; }
n = write(c->fd, RESPONSE_REDIS, sizeof(RESPONSE_REDIS)-1);
return n;
}
int main()
{
struct epoll_event ev, events[MAX_EVENTS];
int listen_sock, conn_sock, nfds, epollfd;
epollfd = epoll_create(MAX_EVENTS);
listen_sock = anetTcpServer(err, 6399, NULL, MAX_EVENTS);
ev.events = EPOLLIN;
ev.data.fd = listen_sock;
epoll_ctl(epollfd, EPOLL_CTL_ADD, listen_sock, &ev);
clients = (client *)malloc(sizeof(client) * MAX_EVENTS);
memset(clients, 0, sizeof(client) * MAX_EVENTS);
for (;;) {
int n;
struct sockaddr addr;
socklen_t addrlen = sizeof(addr);
nfds = epoll_wait(epollfd, events, MAX_EVENTS, 100);
for (n = 0; n < nfds; ++n) {
if (events[n].data.fd == listen_sock) {
conn_sock = accept(listen_sock,
(struct sockaddr *) &addr, &addrlen);
anetNonBlock(err, conn_sock);
ev.events = EPOLLIN;
//ev.events = EPOLLIN | EPOLLET;
ev.data.fd = conn_sock;
epoll_ctl(epollfd, EPOLL_CTL_ADD, conn_sock,&ev);
clients[conn_sock].fd = conn_sock;
} else {
client *c = &clients[events[n].data.fd];
int ret = do_use_fd(c);
if (ret == 0) {
epoll_ctl(epollfd, EPOLL_CTL_DEL, c->fd, &ev);
}
}
}
}
}

Server's listen fd is blocked. set it nonblocked lowers the usage of finish_task_switch to <2%.

Related

How can I modify my code to read from a file input in client and then communicate it to a server?

Code:
//server.c
#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
#include <WinSock2.h>
#include <Windows.h>
#define SCK_VERSION 0x0202
int main()
{
//declare sockets first
SOCKET ConSock; // used for connection, hence the name
SOCKET ListenSock; // used for listening from the server
SOCKADDR_IN address; //socket address
int addrsize = sizeof(address);
long ok;
char MESSAGE[200];
char reply[200];
WSADATA WSD;
WORD DllVersion;
DllVersion = MAKEWORD(2,1);
ok = WSAStartup(DllVersion, &WSD);
//start creating our sockets
ConSock = socket(AF_INET, SOCK_STREAM, NULL);
address.sin_addr.s_addr = inet_addr("127.0.0.1");
address.sin_family = AF_INET;
address.sin_port = htons(10103);
ListenSock = socket(AF_INET, SOCK_STREAM, NULL);
bind(ListenSock, (SOCKADDR*)&address, sizeof(address));
listen(ListenSock, SOMAXCONN);
printf("Server waiting for connections\n\n");
for(;;){
if (ConSock = accept(ListenSock, (SOCKADDR*)&address, &addrsize))
{
ok = recv(ConSock, MESSAGE, sizeof(MESSAGE),NULL);
printf("\nClient says:\t %s", MESSAGE);
printf("\nEnter reply:\t");
gets(reply);
ok = send(ConSock,reply,200,NULL);
}
}
}
//client.c
#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
#include <WinSock2.h>
#include <Windows.h>
#define SCK_VERSION 0x0202
int main()
{
SOCKET sock;
SOCKADDR_IN address;
long ok;
char MESSAGE[200];
char reply[200];
WSADATA WSD;
WORD DllVersion;
DllVersion = MAKEWORD(2,1);
ok = WSAStartup(DllVersion, &WSD);
address.sin_addr.s_addr = inet_addr("127.0.0.1");
address.sin_family = AF_INET;
address.sin_port = htons(10103);
for(;;)
{
sock = socket(AF_INET,SOCK_STREAM,NULL);
connect(sock, (SOCKADDR*)&address, sizeof(address));
printf("\nEnter Message:\t");
gets(MESSAGE);
ok = send(sock,MESSAGE,200,NULL);
ok = recv(sock,reply,sizeof(reply),NULL);
printf("\nServer says:\t %s",reply);
}
}
//makefile
server:
g++ server.c -o server
client:
g++ client.c -o client
clean:
rm server
rm client
all: server client
This runs with -lwsock32 and it does a good job, but I need to adjust it now to comunicate some data from a file, the data is mostly values of voltages, currents, active reactive power, etc.
I have been trying for quite some time and I cannot seem to adjust it.
If you can rewrite it in terms of the client communicating the values(that are stored in a file) to the server, that will be great. Any help is greatly appreciated, as I am not being succesful currently.

Finding physical addresses of NVIDIA GPU memory for DMA

I am trying to find the physical PCIe address space memory locations of GPU memory to support inbound DMA initiated by an external PCIe resource such as an FPGA (similar to How to get physical address of GPU memory for DMA? (OpenCL)). We are specifically trying to avoid GPU-mastered operations or moving data indirectly through host memory.
I have a linux virtual memory address to physical memory addresses translation (see code below) which works great for decoding mmap including to other PCIe devices, and of course I know the physical addresses of the GPUs as reported by the "Region" in lspci -vvv
I have tried instrumenting the CUDA Sample memMapIPCDrv to examine the virtual and physical addresses for the output of cuMemMap and cuMemCreate; and I have tried a looking at the id(obj) and obj.gpu_data.device_ctypes_pointer.value virtual and physical memory addresses from python numba. In all cases, the resulting physical addresses are in host memory, not from the GPU address space. I'll include the python test case below.
Any ideas on how to find the actual on-GPU physical PCIe address?
Example code below uses GPU memory and then checks to see where associated physical memory is mapped to:
import numpy as np
import sys
import numba
from numba import cuda
import socket
import struct
from numba.cuda.cudadrv import driver as _driver
import re
import os
# Prepare for virtual to physical address translation
mySocket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
mySocket.connect("/run/pagemapd")
def do_translate(a):
"""Virtual to Physical Address translation"""
b = struct.pack('Q',a)
mySocket.sendall(b)
data = mySocket.recv(8)
return struct.unpack("Q", data)[0]
def hexsep(n):
"""Convert int to hex and then seperate digits into groups"""
return _hexsep(hex(n))
def _hexsep(H):
"""Seperate hex digits into groups (of 4, _ separated)"""
h = ''
while h != H:
h = H
H = re.sub(r'([0-9a-f]+)([0-9a-f]{4})(?:\b|_)', r'\1_\2', h)
return(h)
def p_translate(a):
"""Translate virtual address to physical and return both as string"""
pa = do_translate(a)
return f"v{hexsep(a)}->p{hexsep(pa)}"
if numba.cuda.is_available() is False:
print("no CUDA GPU found")
sys.exit()
# Create array, push it to GPU
n = np.ones(8192,dtype=np.uint8)*7
ngpu = cuda.mapped_array_like(n)
narr = numba.cuda.as_cuda_array(ngpu)
narr.copy_to_device(n)
# Print relevant physical and virtual addresses, and where memory lives
print(f"n addresses are: id(n)={p_translate(id(n))}, id(n.data)={p_translate(id(n.data))}, Memory is {'on' if _driver.is_device_memory(n) else 'off'} GPU")
print(f"ngpu addresses are: id(ngpu)={p_translate(id(ngpu))}, id(ngpu.data)={p_translate(id(ngpu.data))}, ngpu.gpu_data={p_translate(ngpu.gpu_data.device_ctypes_pointer.value)}, Memory is {'on' if _driver.is_device_memory(ngpu) else 'off'} GPU")
print(f"narr addresses are: id(narr)={p_translate(id(narr))}, narr.gpu_data={p_translate(narr.gpu_data.device_ctypes_pointer.value)}, Memory is {'on' if _driver.is_device_memory(ngpu) else 'off'} GPU")
# Print contents of array
print("ngpu",ngpu)
nn = narr.copy_to_host()
print("copy",nn)
# Set environmental variable DEBUG to anything to print out the mapping table, to see where addresses came from
if "DEBUG" in os.environ:
with open("/proc/self/maps") as R:
for l in R:
m = re.search(r'^([0-9a-f]{4,})-([0-9a-f]{4,})(.*)', l)
if m:
print(_hexsep(m.group(1))+"-"+_hexsep(m.group(2))+m.group(3),sep='')
else:
print(l,sep='')
// Service (run as root) translating virtual address to physical
#define _GNU_SOURCE
#include <sys/socket.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <popt.h>
#include <arpa/inet.h>
#include <ctype.h>
#include <err.h>
#include <linux/un.h>
#include <pthread.h>
#include <sys/stat.h>
#define SPATH "/run/pagemapd"
long pagesize;
// Handle a particular client connection, performing translation
static void *thread_start(void *arg)
{
char buf[sizeof(uintptr_t)*6];
struct ucred ucred;
int ucredlen = sizeof(ucred);
pid_t targetpid = *(pid_t *)buf;
uintptr_t targetaddr = ((uintptr_t *)buf)[1];
int cfd = *(int *)arg;
int pfd = -1;
if (getsockopt(cfd, SOL_SOCKET, SO_PEERCRED, &ucred, &ucredlen) < 0)
{
warn("Cannot getsockopt");
goto endf;
}
snprintf(buf, sizeof(buf), "/proc/%d/pagemap", ucred.pid);
if ((pfd = open(buf, O_RDONLY)) < 0)
{
warn("Cannot open pagemap for %d from %s", ucred.pid, buf);
goto endf;
}
while (read(cfd, buf, sizeof(buf)) > 0)
{
uintptr_t targetaddr = *(uintptr_t *)buf;
uintptr_t entry = 0;
if (lseek(pfd, (uintptr_t)targetaddr / pagesize * sizeof(uintptr_t), SEEK_SET) < 0)
{
warn("Cannot seek for %d", ucred.pid);
goto endf;
}
if (read(pfd, &entry, sizeof(entry)) != sizeof(entry))
{
warn("Cannot read for %d", ucred.pid);
goto endf;
}
targetaddr = (((entry & 0x7fffffffffffffULL) * pagesize) + (((uintptr_t)targetaddr) % pagesize));
if (write(cfd, &targetaddr, sizeof(targetaddr)) != sizeof(targetaddr))
{
warn("Cannot write for %d", ucred.pid);
goto endf;
}
}
endf:
close(cfd);
if (pfd >= 0)
close(pfd);
}
// Set up socket and create thread per connection
int main()
{
int fd = socket(PF_UNIX, SOCK_STREAM, 0);
if (fd < 0)
err(1, "socket");
pagesize = sysconf(_SC_PAGESIZE);
struct sockaddr_un address, client;
int clientlen = sizeof(client);
memset(&address, 0, sizeof(struct sockaddr_un));
unlink(SPATH);
address.sun_family = AF_UNIX;
snprintf(address.sun_path, UNIX_PATH_MAX, SPATH);
if (bind(fd, (struct sockaddr *)&address, sizeof(address)) < 0)
err(1, "bind");
chmod(SPATH, 0666);
if (listen(fd, 5) < 0)
err(1, "listen");
char buf[sizeof(uintptr_t)*2];
struct ucred ucred;
int clientfd;
while ((clientfd = accept(fd, (struct sockaddr *)&client, &clientlen)) >= 0)
{
pthread_t child;
if (pthread_create(&child, NULL, thread_start, &clientfd) < 0)
err(2, "pthread_create");
}
}
Exposing CUDA memory for PCIe access requires kernel-driver calls. You can find a detailed explanation on how to do that in the GPUDirect RDMA documentation.

running gzip on sigle core in muticore environment under unix

I have a requirement to use only single core to test gzip performance in multi-core cpu environment(not sure what is the default settings for gzip in this case). Need help to find out the command to execute gzip compression in single core.
Thanks
gzip is single threaded by default so in effect it will look like it's running on one core ie it might run on several physical cores but it won't be in parallel.
If you absolutely must run on one core and you're on linux you would set affinity to a particular core.
http://man7.org/linux/man-pages/man2/sched_setaffinity.2.html
This is code that I got from the man page.
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>
#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \
} while (0)
int
main(int argc, char *argv[])
{
cpu_set_t set;
int parentCPU, childCPU;
int nloops, j;
if (argc != 4) {
fprintf(stderr, "Usage: %s parent-cpu child-cpu num-loops\n",
argv[0]);
exit(EXIT_FAILURE);
}
parentCPU = atoi(argv[1]);
childCPU = atoi(argv[2]);
nloops = atoi(argv[3]);
CPU_ZERO(&set);
switch (fork()) {
case -1: /* Error */
errExit("fork");
case 0: /* Child */
CPU_SET(childCPU, &set);
if (sched_setaffinity(getpid(), sizeof(set), &set) == -1)
errExit("sched_setaffinity");
for (j = 0; j < nloops; j++)
getppid();
exit(EXIT_SUCCESS);
default: /* Parent */
CPU_SET(parentCPU, &set);
if (sched_setaffinity(getpid(), sizeof(set), &set) == -1)
errExit("sched_setaffinity");
for (j = 0; j < nloops; j++)
getppid();
wait(NULL); /* Wait for child to terminate */
exit(EXIT_SUCCESS);
}
}
If you need to test with no interruptions from the kernel you need to write a kernel module for that.

configuration of adafruit ultimate gps from raspberry pi does not work

I connected my ultimate gps breakout to my raspberry pi using the USB to TTL serial cable. Using C code I can easily connect to and read NMEA sentences from the GPS. But when I write configuration commands such as PMTK220 to set the update rate, they are ignored. I should get back a PMTK_ACK reporting success or failure, but it is not forthcoming. The problem also occurs when I use terminal windows. ie. I run:
while (true) do cat -A /dev/ttyUSB0 ; done
in one terminal, and get a stream of $GPGGA, $GPGSA, $GPRMC etc messages. In another terminal I run:
echo "$PMTK220,200*2C\r\n" > /dev/ttyUSB0
The NMEA messages continue but there is no PMTK001 coming back. Any ideas?
Ok, here's some code (inspired by gpsd source) that demonstrates sending a configuration message and getting an acknowledgement:
#define _BSD_SOURCE
#include <stdio.h>
#include <stdarg.h>
#include <sys/ioctl.h>
#include <termios.h>
#include <fcntl.h>
#include <unistd.h>
#include <assert.h>
#include <semaphore.h>
#include <string.h>
#ifndef CRTSCTS
# ifdef CNEW_RTSCTS
# define CRTSCTS CNEW_RTSCTS
# else
# define CRTSCTS 0
# endif /* CNEW_RTSCTS */
#endif /* !CRTSCTS */
int main (int argc, char **argv) {
int const baudr = B9600, mode = O_RDWR | O_NONBLOCK | O_NOCTTY;
int const fd = open("/dev/ttyUSB0", mode);
assert (fd != -1);
ioctl(fd, (unsigned long)TIOCEXCL);
struct termios ttyOld, ttyNew;
assert(tcgetattr(fd, &ttyOld) != -1);
ttyNew = ttyOld;
ttyNew.c_cflag &= ~(CSIZE | PARENB | PARODD | CRTSCTS | CSTOPB);
ttyNew.c_cflag |= CREAD | CLOCAL | CS8;
ttyNew.c_iflag = ttyNew.c_oflag = 0;
ttyNew.c_lflag = ICANON;
cfsetispeed(&ttyNew, baudr);
cfsetospeed(&ttyNew, baudr);
assert(tcsetattr(fd, TCSANOW, &ttyNew) != -1);
tcflush(fd, TCIOFLUSH);
usleep(200000);
tcflush(fd, TCIOFLUSH);
int const oldfl = fcntl(fd, F_GETFL);
assert (oldfl != -1);
fcntl(fd, F_SETFL, oldfl & ~O_NONBLOCK);
tcdrain(fd);
printf("port opened baudr=%d mode=%d i=%d o=%d c=%d l=%d fl=%d\n", baudr, mode, ttyNew.c_iflag, ttyNew.c_oflag, ttyNew.c_cflag, ttyNew.c_lflag, oldfl);
unsigned char buf[2048];
int count = 0;
while(++count < 20) {
if (count == 4) {
char const *const cmd = "$PMTK220,200*2C\r\n";
int const n = strlen(cmd);
assert(write(fd, cmd, n) == n);
tcdrain(fd);
printf("wrote command %d: %s\n", n, cmd);
}
int const n = read(fd, buf, sizeof(buf));
buf[(n >= sizeof(buf)) ? (sizeof(buf) - 1) : n] = 0;
printf(buf);
}
tcsetattr(fd, TCSANOW, &ttyOld);
close(fd);
}
with results:
pi#pi01 ~/c $ ./test
port opened baudr=13 mode=2306 i=0 o=0 c=3261 l=2 fl=2050
$GPGGA,175748.089,,,,,0,00,,,M,,M,,*71
$GPGSA,A,1,,,,,,,,,,,,,,,*1E
$GPRMC,175748.089,V,,,,,0.00,0.00,260715,,,N*43
wrote command 17: $PMTK220,200*2C
$GPVTG,0.00,T,,M,0.00,N,0.00,K,N*32
$PMTK001,220,2*31
$GPGGA,175749.089,,,,,0,00,,,M,,M,,*70
$GPGSA,A,1,,,,,,,,,,,,,,,*1E
$GPGSV,1,1,01,11,,,31*7A
$GPRMC,175749.089,V,,,,,0.00,0.00,260715,,,N*42
$GPVTG,0.00,T,,M,0.00,N,0.00,K,N*32
$GPGGA,175750.089,,,,,0,00,,,M,,M,,*78
$GPGSA,A,1,,,,,,,,,,,,,,,*1E
$GPRMC,175750.089,V,,,,,0.00,0.00,260715,,,N*4A
$GPVTG,0.00,T,,M,0.00,N,0.00,K,N*32
$GPGGA,175751.089,,,,,0,00,,,M,,M,,*79
$GPGSA,A,1,,,,,,,,,,,,,,,*1E
$GPRMC,175751.089,V,,,,,0.00,0.00,260715,,,N*4B
$GPVTG,0.00,T,,M,0.00,N,0.00,K,N*32
$GPGGA,175752.089,,,,,0,00,,,M,,M,,*7A
pi#pi01 ~/c $

Handle GPIO in User Space ARM9 Embedded Linux AM1808

I have to interface my GSM module with the AM1808 based on ARM9.
I have assigned all the GPIO pins to the Da850.c as well as mux.h files. I successfully created a uImage and inserted that image in my flash.
I need to handle some of that GPIO from User application.
I know that we can handle the GPIO from the Kerel space but i need to handle from the user space.
As for example I have assigned a GPIO for power key to GSM module. I need to change the pin means (HIGH or LOW) through application.
Ok i have written a following code to access it from the User Space,
#include <stdio.h>
#include <time.h>
#include <signal.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>
#include "GSMpwr.h"
#define BS_GSM_PWR_REGISTER 0x01E26014
#define BS_DCDS_MASK 0x00000004
int fd; // Memory device descriptor
unsigned long *pPWR;
unsigned short GetGSMpwr(void)
{
#if defined __HOST_ARM
unsigned long dcd_value = *pPWR;
return (pwr_value >> 7) & 0x01;
#endif
}
void InitializeGSMpwr(void)
{
#if defined __HOST_ARM
int page_size = getpagesize();
unsigned int MAP_addr;
unsigned int reg_addr;
unsigned char *pTemp; // Pointer to GSMpwr register
/*
* Open memory and get pointer to GSMpwr register in the FPGA
*/
if((fd = open("/dev/mem", O_RDWR | O_SYNC)) < 0)
{
printf("failed to open /dev/mem");
return;
}
else
{
MAP_addr = (BS_GSM_PWR_REGISTER & ~(page_size - 1));
pTemp = (unsigned char *)mmap(NULL, page_size,(PROT_READ | PROT_WRITE),MAP_SHARED,fd,MAP_addr);
if((pTemp == MAP_FAILED) || (pTemp == NULL))
{
printf("failed to map /dev/mem");
return;
}
else
{
printf(“Memory Mapped at Address %p. \n”,pTemp);
}
virt_addr = map_base + (control & MAP_MASK);
reg_addr = (BS_GSM_PWR_REGISTER & (page_size - 1));
pPWR = (unsigned long*)(pTemp + reg_addr);
printf("GSM PWR PIN mapped in Application\n");
}
I can only read that pin through this code, Now i want to use that pin as an output and want to go high and low with the time interval of 3sec.
The easiest way is to utilize GPIO support in sysfs, where you could control all the exported GPIO's. Please have a look at the Linux kernel GPIO documentation, in particular, Sysfs Interface for Userspace part.
After you have enabled GPIO support in sysfs (GPIO_SYSFS), the GPIO control would be as easy as:
Example
GPIO=22
cd /sys/class/gpio
ls
echo $GPIO > /sys/class/gpio/export
ls
Notice on the first ls that gpio22 doesn't exist, but does after you export GPIO 22 to user space.
cd /sys/class/gpio/gpio$GPIO
ls
There are files to set the direction and retrieve the current value.
echo "in" > direction
cat value
You can configure the GPIO for output and set the value as well.
echo "out" > direction
echo 1 > value
Example is taken from here.
I got it please find following code for that,I got the Specific pin address and i have accessed that pin like,
unsigned short GetGSMpwr(void)
{
unsigned long pwr_value = *pPWR;
printf("GSM_PWR:check Start : %ld",pwr_value);
return (pwr_value >> 1) & 0x01;
}
unsigned short SetGSMpwr(void)
{
unsigned long pwr_value = *pPWR;
printf("GSM_PWR:check Start : %ld",pwr_value);
*pPWR = ~((pwr_value >> 1) & 0x01);
}
unsigned short ClrGSMpwr(void)
{
unsigned long pwr_value = *pPWR;
printf("GSM_PWR:check Start : %ld",pwr_value);
*pPWR = 256;
}`