Is pow(x, 2.0) fast as x * x in GLSL? [duplicate] - optimization

Which is faster in GLSL:
pow(x, 3.0f);
or
x*x*x;
?
Does exponentiation performance depend on hardware vendor or exponent value?

I wrote a small benchmark, because I was interested in the results.
In my personal case, I was most interested in exponent = 5.
Benchmark code (running in Rem's Studio / LWJGL):
package me.anno.utils.bench
import me.anno.gpu.GFX
import me.anno.gpu.GFX.flat01
import me.anno.gpu.RenderState
import me.anno.gpu.RenderState.useFrame
import me.anno.gpu.framebuffer.Frame
import me.anno.gpu.framebuffer.Framebuffer
import me.anno.gpu.hidden.HiddenOpenGLContext
import me.anno.gpu.shader.Renderer
import me.anno.gpu.shader.Shader
import me.anno.utils.types.Floats.f2
import org.lwjgl.opengl.GL11.*
import java.nio.ByteBuffer
import kotlin.math.roundToInt
fun main() {
fun createShader(code: String) = Shader(
"", null, "" +
"attribute vec2 attr0;\n" +
"void main(){\n" +
" gl_Position = vec4(attr0*2.0-1.0, 0.0, 1.0);\n" +
" uv = attr0;\n" +
"}", "varying vec2 uv;\n", "" +
"void main(){" +
code +
"}"
)
fun repeat(code: String, times: Int): String {
return Array(times) { code }.joinToString("\n")
}
val size = 512
val warmup = 50
val benchmark = 1000
HiddenOpenGLContext.setSize(size, size)
HiddenOpenGLContext.createOpenGL()
val buffer = Framebuffer("", size, size, 1, 1, true, Framebuffer.DepthBufferType.NONE)
println("Power,Multiplications,GFlops-multiplication,GFlops-floats,GFlops-ints,GFlops-power,Speedup")
useFrame(buffer, Renderer.colorRenderer) {
RenderState.blendMode.use(me.anno.gpu.blending.BlendMode.ADD) {
for (power in 2 until 100) {
// to reduce the overhead of other stuff
val repeats = 100
val init = "float x1 = dot(uv, vec2(1.0)),x2,x4,x8,x16,x32,x64;\n"
val end = "gl_FragColor = vec4(x1,x1,x1,x1);\n"
val manualCode = StringBuilder()
for (bit in 1 until 32) {
val p = 1.shl(bit)
val h = 1.shl(bit - 1)
if (power == p) {
manualCode.append("x1=x$h*x$h;")
break
} else if (power > p) {
manualCode.append("x$p=x$h*x$h;")
} else break
}
if (power.and(power - 1) != 0) {
// not a power of two, so the result isn't finished yet
manualCode.append("x1=")
var first = true
for (bit in 0 until 32) {
val p = 1.shl(bit)
if (power.and(p) != 0) {
if (!first) {
manualCode.append('*')
} else first = false
manualCode.append("x$p")
}
}
manualCode.append(";\n")
}
val multiplications = manualCode.count { it == '*' }
// println("$power: $manualCode")
val shaders = listOf(
// manually optimized
createShader(init + repeat(manualCode.toString(), repeats) + end),
// can be optimized
createShader(init + repeat("x1=pow(x1,$power.0);", repeats) + end),
// can be optimized, int as power
createShader(init + repeat("x1=pow(x1,$power);", repeats) + end),
// slightly different, so it can't be optimized
createShader(init + repeat("x1=pow(x1,${power}.01);", repeats) + end),
)
for (shader in shaders) {
shader.use()
}
val pixels = ByteBuffer.allocateDirect(4)
Frame.bind()
glClearColor(0f, 0f, 0f, 1f)
glClear(GL_COLOR_BUFFER_BIT or GL_DEPTH_BUFFER_BIT)
for (i in 0 until warmup) {
for (shader in shaders) {
shader.use()
flat01.draw(shader)
}
}
val flops = DoubleArray(shaders.size)
val avg = 10 // for more stability between runs
for (j in 0 until avg) {
for (index in shaders.indices) {
val shader = shaders[index]
GFX.check()
val t0 = System.nanoTime()
for (i in 0 until benchmark) {
shader.use()
flat01.draw(shader)
}
// synchronize
glReadPixels(0, 0, 1, 1, GL_RGBA, GL_UNSIGNED_BYTE, pixels)
GFX.check()
val t1 = System.nanoTime()
// the first one may be an outlier
if (j > 0) flops[index] += multiplications * repeats.toDouble() * benchmark.toDouble() * size * size / (t1 - t0)
GFX.check()
}
}
for (i in flops.indices) {
flops[i] /= (avg - 1.0)
}
println(
"" +
"$power,$multiplications," +
"${flops[0].roundToInt()}," +
"${flops[1].roundToInt()}," +
"${flops[2].roundToInt()}," +
"${flops[3].roundToInt()}," +
(flops[0] / flops[3]).f2()
)
}
}
}
}
The sampler function is run 9x 512² pixels * 1000 times, and evaluates the function 100 times each.
I run this code on my RX 580, 8GB from Gigabyte, and collected the following results:
Power
#Mult
GFlops*
GFlopsFp
GFlopsInt
GFlopsPow
Speedup
2
1
1246
1429
1447
324
3.84
3
2
2663
2692
2708
651
4.09
4
2
2682
2679
2698
650
4.12
5
3
2766
972
974
973
2.84
6
3
2785
978
974
976
2.85
7
4
2830
1295
1303
1299
2.18
8
3
2783
2792
2809
960
2.90
9
4
2836
1298
1301
1302
2.18
10
4
2833
1291
1302
1298
2.18
11
5
2858
1623
1629
1623
1.76
12
4
2824
1302
1295
1303
2.17
13
5
2866
1628
1624
1626
1.76
14
5
2869
1614
1623
1611
1.78
15
6
2886
1945
1943
1953
1.48
16
4
2821
1305
1300
1305
2.16
17
5
2868
1615
1625
1619
1.77
18
5
2858
1620
1625
1624
1.76
19
6
2890
1949
1946
1949
1.48
20
5
2871
1618
1627
1625
1.77
21
6
2879
1945
1947
1943
1.48
22
6
2886
1944
1949
1952
1.48
23
7
2901
2271
2269
2268
1.28
24
5
2872
1621
1628
1624
1.77
25
6
2886
1942
1943
1942
1.49
26
6
2880
1949
1949
1953
1.47
27
7
2891
2273
2263
2266
1.28
28
6
2883
1949
1946
1953
1.48
29
7
2910
2279
2281
2279
1.28
30
7
2899
2272
2276
2277
1.27
31
8
2906
2598
2595
2596
1.12
32
5
2872
1621
1625
1622
1.77
33
6
2901
1953
1942
1949
1.49
34
6
2895
1948
1939
1944
1.49
35
7
2895
2274
2266
2268
1.28
36
6
2881
1937
1944
1948
1.48
37
7
2894
2277
2270
2280
1.27
38
7
2902
2275
2264
2273
1.28
39
8
2910
2602
2594
2603
1.12
40
6
2877
1945
1947
1945
1.48
41
7
2892
2276
2277
2282
1.27
42
7
2887
2271
2272
2273
1.27
43
8
2912
2599
2606
2599
1.12
44
7
2910
2278
2284
2276
1.28
45
8
2920
2597
2601
2600
1.12
46
8
2920
2600
2601
2590
1.13
47
9
2925
2921
2926
2927
1.00
48
6
2885
1935
1955
1956
1.47
49
7
2901
2271
2279
2288
1.27
50
7
2904
2281
2276
2278
1.27
51
8
2919
2608
2594
2607
1.12
52
7
2902
2282
2270
2273
1.28
53
8
2903
2598
2602
2598
1.12
54
8
2918
2602
2602
2604
1.12
55
9
2932
2927
2924
2936
1.00
56
7
2907
2284
2282
2281
1.27
57
8
2920
2606
2604
2610
1.12
58
8
2913
2593
2597
2587
1.13
59
9
2925
2923
2924
2920
1.00
60
8
2930
2614
2606
2613
1.12
61
9
2932
2946
2946
2947
1.00
62
9
2926
2935
2937
2947
0.99
63
10
2958
3258
3192
3266
0.91
64
6
2902
1957
1956
1959
1.48
65
7
2903
2274
2267
2273
1.28
66
7
2909
2277
2276
2286
1.27
67
8
2908
2602
2606
2599
1.12
68
7
2894
2272
2279
2276
1.27
69
8
2923
2597
2606
2606
1.12
70
8
2910
2596
2599
2600
1.12
71
9
2926
2921
2927
2924
1.00
72
7
2909
2283
2273
2273
1.28
73
8
2909
2602
2602
2599
1.12
74
8
2914
2602
2602
2603
1.12
75
9
2924
2925
2927
2933
1.00
76
8
2904
2608
2602
2601
1.12
77
9
2911
2919
2917
2909
1.00
78
9
2927
2921
2917
2935
1.00
79
10
2929
3241
3246
3246
0.90
80
7
2903
2273
2276
2275
1.28
81
8
2916
2596
2592
2589
1.13
82
8
2913
2600
2597
2598
1.12
83
9
2925
2931
2926
2913
1.00
84
8
2917
2598
2606
2597
1.12
85
9
2920
2916
2918
2927
1.00
86
9
2942
2922
2944
2936
1.00
87
10
2961
3254
3259
3268
0.91
88
8
2934
2607
2608
2612
1.12
89
9
2918
2939
2931
2916
1.00
90
9
2927
2928
2920
2924
1.00
91
10
2940
3253
3252
3246
0.91
92
9
2924
2933
2926
2928
1.00
93
10
2940
3259
3237
3251
0.90
94
10
2928
3247
3247
3264
0.90
95
11
2933
3599
3593
3594
0.82
96
7
2883
2282
2268
2269
1.27
97
8
2911
2602
2595
2600
1.12
98
8
2896
2588
2591
2587
1.12
99
9
2924
2939
2936
2938
1.00
As you can see, a power() call takes exactly as long as 9 multiplication instructions. Therefore every manual rewriting of a power with less than 9 multiplications is faster.
Only the cases 2, 3, 4, and 8 are optimized by my driver. The optimization is independent of whether you use the .0 suffix for the exponent.
In the case of exponent = 2, my implementation seems to have lower performance than the driver. I am not sure, why.
The speedup is the manual implementation compared to pow(x,exponent+0.01), which cannot be optimized by the compiler.
Because the multiplications and the speedup align so perfectly, I created a graph to show the relationship. This relationship kind of shows that my benchmark is trustworthy :).
Operating System: Windows 10 Personal
GPU: RX 580 8GB from Gigabyte
Processor: Ryzen 5 2600
Memory: 16 GB DDR4 3200
GPU Driver: 21.6.1 from 17th June 2021
LWJGL: Version 3.2.3 build 13

While this can definitely be hardware/vendor/compiler dependent, advanced mathematical functions like pow() tend to be considerably more expensive than basic operations.
The best approach is of course to try both, and benchmark. But if there is a simple replacement for an advanced mathematical functions, I don't think you can go very wrong by using it.
If you write pow(x, 3.0), the best you can probably hope for is that the compiler will recognize the special case, and expand it. But why take the risk, if the replacement is just as short and easy to read? C/C++ compilers don't always replace pow(x, 2.0) by a simple multiplication, so I wouldn't necessarily count on all GLSL compilers to do that.

Related

How do i delete rows that contains just a specific word in pandas [duplicate]

I have the following DataFrame:
daysago line_race rating rw wrating
line_date
2007-03-31 62 11 56 1.000000 56.000000
2007-03-10 83 11 67 1.000000 67.000000
2007-02-10 111 9 66 1.000000 66.000000
2007-01-13 139 10 83 0.880678 73.096278
2006-12-23 160 10 88 0.793033 69.786942
2006-11-09 204 9 52 0.636655 33.106077
2006-10-22 222 8 66 0.581946 38.408408
2006-09-29 245 9 70 0.518825 36.317752
2006-09-16 258 11 68 0.486226 33.063381
2006-08-30 275 8 72 0.446667 32.160051
2006-02-11 475 5 65 0.164591 10.698423
2006-01-13 504 0 70 0.142409 9.968634
2006-01-02 515 0 64 0.134800 8.627219
2005-12-06 542 0 70 0.117803 8.246238
2005-11-29 549 0 70 0.113758 7.963072
2005-11-22 556 0 -1 0.109852 -0.109852
2005-11-01 577 0 -1 0.098919 -0.098919
2005-10-20 589 0 -1 0.093168 -0.093168
2005-09-27 612 0 -1 0.083063 -0.083063
2005-09-07 632 0 -1 0.075171 -0.075171
2005-06-12 719 0 69 0.048690 3.359623
2005-05-29 733 0 -1 0.045404 -0.045404
2005-05-02 760 0 -1 0.039679 -0.039679
2005-04-02 790 0 -1 0.034160 -0.034160
2005-03-13 810 0 -1 0.030915 -0.030915
2004-11-09 934 0 -1 0.016647 -0.016647
I need to remove the rows where line_race is equal to 0. What's the most efficient way to do this?
If I'm understanding correctly, it should be as simple as:
df = df[df.line_race != 0]
But for any future bypassers you could mention that df = df[df.line_race != 0] doesn't do anything when trying to filter for None/missing values.
Does work:
df = df[df.line_race != 0]
Doesn't do anything:
df = df[df.line_race != None]
Does work:
df = df[df.line_race.notnull()]
just to add another solution, particularly useful if you are using the new pandas assessors, other solutions will replace the original pandas and lose the assessors
df.drop(df.loc[df['line_race']==0].index, inplace=True)
If you want to delete rows based on multiple values of the column, you could use:
df[(df.line_race != 0) & (df.line_race != 10)]
To drop all rows with values 0 and 10 for line_race.
In case of multiple values and str dtype
I used the following to filter out given values in a col:
def filter_rows_by_values(df, col, values):
return df[~df[col].isin(values)]
Example:
In a DataFrame I want to remove rows which have values "b" and "c" in column "str"
df = pd.DataFrame({"str": ["a","a","a","a","b","b","c"], "other": [1,2,3,4,5,6,7]})
df
str other
0 a 1
1 a 2
2 a 3
3 a 4
4 b 5
5 b 6
6 c 7
filter_rows_by_values(df, "str", ["b","c"])
str other
0 a 1
1 a 2
2 a 3
3 a 4
Though the previous answer are almost similar to what I am going to do, but using the index method does not require using another indexing method .loc(). It can be done in a similar but precise manner as
df.drop(df.index[df['line_race'] == 0], inplace = True)
The best way to do this is with boolean masking:
In [56]: df
Out[56]:
line_date daysago line_race rating raw wrating
0 2007-03-31 62 11 56 1.000 56.000
1 2007-03-10 83 11 67 1.000 67.000
2 2007-02-10 111 9 66 1.000 66.000
3 2007-01-13 139 10 83 0.881 73.096
4 2006-12-23 160 10 88 0.793 69.787
5 2006-11-09 204 9 52 0.637 33.106
6 2006-10-22 222 8 66 0.582 38.408
7 2006-09-29 245 9 70 0.519 36.318
8 2006-09-16 258 11 68 0.486 33.063
9 2006-08-30 275 8 72 0.447 32.160
10 2006-02-11 475 5 65 0.165 10.698
11 2006-01-13 504 0 70 0.142 9.969
12 2006-01-02 515 0 64 0.135 8.627
13 2005-12-06 542 0 70 0.118 8.246
14 2005-11-29 549 0 70 0.114 7.963
15 2005-11-22 556 0 -1 0.110 -0.110
16 2005-11-01 577 0 -1 0.099 -0.099
17 2005-10-20 589 0 -1 0.093 -0.093
18 2005-09-27 612 0 -1 0.083 -0.083
19 2005-09-07 632 0 -1 0.075 -0.075
20 2005-06-12 719 0 69 0.049 3.360
21 2005-05-29 733 0 -1 0.045 -0.045
22 2005-05-02 760 0 -1 0.040 -0.040
23 2005-04-02 790 0 -1 0.034 -0.034
24 2005-03-13 810 0 -1 0.031 -0.031
25 2004-11-09 934 0 -1 0.017 -0.017
In [57]: df[df.line_race != 0]
Out[57]:
line_date daysago line_race rating raw wrating
0 2007-03-31 62 11 56 1.000 56.000
1 2007-03-10 83 11 67 1.000 67.000
2 2007-02-10 111 9 66 1.000 66.000
3 2007-01-13 139 10 83 0.881 73.096
4 2006-12-23 160 10 88 0.793 69.787
5 2006-11-09 204 9 52 0.637 33.106
6 2006-10-22 222 8 66 0.582 38.408
7 2006-09-29 245 9 70 0.519 36.318
8 2006-09-16 258 11 68 0.486 33.063
9 2006-08-30 275 8 72 0.447 32.160
10 2006-02-11 475 5 65 0.165 10.698
UPDATE: Now that pandas 0.13 is out, another way to do this is df.query('line_race != 0').
The given answer is correct nontheless as someone above said you can use df.query('line_race != 0') which depending on your problem is much faster. Highly recommend.
Another way of doing it. May not be the most efficient way as the code looks a bit more complex than the code mentioned in other answers, but still alternate way of doing the same thing.
df = df.drop(df[df['line_race']==0].index)
One of the efficient and pandaic way is using eq() method:
df[~df.line_race.eq(0)]
I compiled and run my code. This is accurate code. You can try it your own.
data = pd.read_excel('file.xlsx')
If you have any special character or space in column name you can write it in '' like in the given code:
data = data[data['expire/t'].notnull()]
print (date)
If there is just a single string column name without any space or special
character you can directly access it.
data = data[data.expire ! = 0]
print (date)
Adding one more way to do this.
df = df.query("line_race!=0")
There are various ways to achieve that. Will leave below various options, that one can use, depending on specificities of one's use case.
One will consider that OP's dataframe is stored in the variable df.
Option 1
For OP's case, considering that the only column with values 0 is the line_race, the following will do the work
df_new = df[df != 0].dropna()
[Out]:
line_date daysago line_race rating rw wrating
0 2007-03-31 62 11.0 56 1.000000 56.000000
1 2007-03-10 83 11.0 67 1.000000 67.000000
2 2007-02-10 111 9.0 66 1.000000 66.000000
3 2007-01-13 139 10.0 83 0.880678 73.096278
4 2006-12-23 160 10.0 88 0.793033 69.786942
5 2006-11-09 204 9.0 52 0.636655 33.106077
6 2006-10-22 222 8.0 66 0.581946 38.408408
7 2006-09-29 245 9.0 70 0.518825 36.317752
8 2006-09-16 258 11.0 68 0.486226 33.063381
9 2006-08-30 275 8.0 72 0.446667 32.160051
10 2006-02-11 475 5.0 65 0.164591 10.698423
However, as that is not always the case, would recommend checking the following options where one will specify the column name.
Option 2
tshauck's approach ends up being better than Option 1, because one is able to specify the column. There are, however, additional variations depending on how one wants to refer to the column:
For example, using the position in the dataframe
df_new = df[df[df.columns[2]] != 0]
Or by explicitly indicating the column as follows
df_new = df[df['line_race'] != 0]
One can also follow the same login but using a custom lambda function, such as
df_new = df[df.apply(lambda x: x['line_race'] != 0, axis=1)]
[Out]:
line_date daysago line_race rating rw wrating
0 2007-03-31 62 11.0 56 1.000000 56.000000
1 2007-03-10 83 11.0 67 1.000000 67.000000
2 2007-02-10 111 9.0 66 1.000000 66.000000
3 2007-01-13 139 10.0 83 0.880678 73.096278
4 2006-12-23 160 10.0 88 0.793033 69.786942
5 2006-11-09 204 9.0 52 0.636655 33.106077
6 2006-10-22 222 8.0 66 0.581946 38.408408
7 2006-09-29 245 9.0 70 0.518825 36.317752
8 2006-09-16 258 11.0 68 0.486226 33.063381
9 2006-08-30 275 8.0 72 0.446667 32.160051
10 2006-02-11 475 5.0 65 0.164591 10.698423
Option 3
Using pandas.Series.map and a custom lambda function
df_new = df['line_race'].map(lambda x: x != 0)
[Out]:
line_date daysago line_race rating rw wrating
0 2007-03-31 62 11.0 56 1.000000 56.000000
1 2007-03-10 83 11.0 67 1.000000 67.000000
2 2007-02-10 111 9.0 66 1.000000 66.000000
3 2007-01-13 139 10.0 83 0.880678 73.096278
4 2006-12-23 160 10.0 88 0.793033 69.786942
5 2006-11-09 204 9.0 52 0.636655 33.106077
6 2006-10-22 222 8.0 66 0.581946 38.408408
7 2006-09-29 245 9.0 70 0.518825 36.317752
8 2006-09-16 258 11.0 68 0.486226 33.063381
9 2006-08-30 275 8.0 72 0.446667 32.160051
10 2006-02-11 475 5.0 65 0.164591 10.698423
Option 4
Using pandas.DataFrame.drop as follows
df_new = df.drop(df[df['line_race'] == 0].index)
[Out]:
line_date daysago line_race rating rw wrating
0 2007-03-31 62 11.0 56 1.000000 56.000000
1 2007-03-10 83 11.0 67 1.000000 67.000000
2 2007-02-10 111 9.0 66 1.000000 66.000000
3 2007-01-13 139 10.0 83 0.880678 73.096278
4 2006-12-23 160 10.0 88 0.793033 69.786942
5 2006-11-09 204 9.0 52 0.636655 33.106077
6 2006-10-22 222 8.0 66 0.581946 38.408408
7 2006-09-29 245 9.0 70 0.518825 36.317752
8 2006-09-16 258 11.0 68 0.486226 33.063381
9 2006-08-30 275 8.0 72 0.446667 32.160051
10 2006-02-11 475 5.0 65 0.164591 10.698423
Option 5
Using pandas.DataFrame.query as follows
df_new = df.query('line_race != 0')
[Out]:
line_date daysago line_race rating rw wrating
0 2007-03-31 62 11.0 56 1.000000 56.000000
1 2007-03-10 83 11.0 67 1.000000 67.000000
2 2007-02-10 111 9.0 66 1.000000 66.000000
3 2007-01-13 139 10.0 83 0.880678 73.096278
4 2006-12-23 160 10.0 88 0.793033 69.786942
5 2006-11-09 204 9.0 52 0.636655 33.106077
6 2006-10-22 222 8.0 66 0.581946 38.408408
7 2006-09-29 245 9.0 70 0.518825 36.317752
8 2006-09-16 258 11.0 68 0.486226 33.063381
9 2006-08-30 275 8.0 72 0.446667 32.160051
10 2006-02-11 475 5.0 65 0.164591 10.698423
Option 6
Using pandas.DataFrame.drop and pandas.DataFrame.query as follows
df_new = df.drop(df.query('line_race == 0').index)
[Out]:
line_date daysago line_race rating rw wrating
0 2007-03-31 62 11.0 56 1.000000 56.000000
1 2007-03-10 83 11.0 67 1.000000 67.000000
2 2007-02-10 111 9.0 66 1.000000 66.000000
3 2007-01-13 139 10.0 83 0.880678 73.096278
4 2006-12-23 160 10.0 88 0.793033 69.786942
5 2006-11-09 204 9.0 52 0.636655 33.106077
6 2006-10-22 222 8.0 66 0.581946 38.408408
7 2006-09-29 245 9.0 70 0.518825 36.317752
8 2006-09-16 258 11.0 68 0.486226 33.063381
9 2006-08-30 275 8.0 72 0.446667 32.160051
10 2006-02-11 475 5.0 65 0.164591 10.698423
Option 7
If one doesn't have strong opinions on the output, one can use a vectorized approach with numpy.select
df_new = np.select([df != 0], [df], default=np.nan)
[Out]:
[['2007-03-31' 62 11.0 56 1.0 56.0]
['2007-03-10' 83 11.0 67 1.0 67.0]
['2007-02-10' 111 9.0 66 1.0 66.0]
['2007-01-13' 139 10.0 83 0.880678 73.096278]
['2006-12-23' 160 10.0 88 0.793033 69.786942]
['2006-11-09' 204 9.0 52 0.636655 33.106077]
['2006-10-22' 222 8.0 66 0.581946 38.408408]
['2006-09-29' 245 9.0 70 0.518825 36.317752]
['2006-09-16' 258 11.0 68 0.486226 33.063381]
['2006-08-30' 275 8.0 72 0.446667 32.160051]
['2006-02-11' 475 5.0 65 0.164591 10.698423]]
This can also be converted to a dataframe with
df_new = pd.DataFrame(df_new, columns=df.columns)
[Out]:
line_date daysago line_race rating rw wrating
0 2007-03-31 62 11.0 56 1.0 56.0
1 2007-03-10 83 11.0 67 1.0 67.0
2 2007-02-10 111 9.0 66 1.0 66.0
3 2007-01-13 139 10.0 83 0.880678 73.096278
4 2006-12-23 160 10.0 88 0.793033 69.786942
5 2006-11-09 204 9.0 52 0.636655 33.106077
6 2006-10-22 222 8.0 66 0.581946 38.408408
7 2006-09-29 245 9.0 70 0.518825 36.317752
8 2006-09-16 258 11.0 68 0.486226 33.063381
9 2006-08-30 275 8.0 72 0.446667 32.160051
10 2006-02-11 475 5.0 65 0.164591 10.698423
With regards to the most efficient solution, that would depend on how one wants to measure efficiency. Assuming that one wants to measure the time of execution, one way that one can go about doing it is with time.perf_counter().
If one measures the time of execution for all the options above, one gets the following
method time
0 Option 1 0.00000110000837594271
1 Option 2.1 0.00000139995245262980
2 Option 2.2 0.00000369996996596456
3 Option 2.3 0.00000160001218318939
4 Option 3 0.00000110000837594271
5 Option 4 0.00000120000913739204
6 Option 5 0.00000140001066029072
7 Option 6 0.00000159995397552848
8 Option 7 0.00000150001142174006
However, this might change depending on the dataframe one uses, on the requirements (such as hardware), and more.
Notes:
There are various suggestions on using inplace=True. Would suggest reading this: https://stackoverflow.com/a/59242208/7109869
There are also some people with strong opinions on .apply(). Would suggest reading this: When should I (not) want to use pandas apply() in my code?
If one has missing values, one might want to consider as well pandas.DataFrame.dropna. Using the option 2, it would be something like
df = df[df['line_race'] != 0].dropna()
There are additional ways to measure the time of execution, so I would recommend this thread: How do I get time of a Python program's execution?
Just adding another way for DataFrame expanded over all columns:
for column in df.columns:
df = df[df[column]!=0]
Example:
def z_score(data,count):
threshold=3
for column in data.columns:
mean = np.mean(data[column])
std = np.std(data[column])
for i in data[column]:
zscore = (i-mean)/std
if(np.abs(zscore)>threshold):
count=count+1
data = data[data[column]!=i]
return data,count
Just in case you need to delete the row, but the value can be in different columns.
In my case I was using percentages so I wanted to delete the rows which has a value 1 in any column, since that means that it's the 100%
for x in df:
df.drop(df.loc[df[x]==1].index, inplace=True)
Is not optimal if your df have too many columns.
so many options provided(or maybe i didnt pay much attention to it, sorry if its the case), but no one mentioned this:
we can use this notation in pandas: ~ (this gives us the inverse of the condition)
df = df[~df["line_race"] == 0]
It doesn't make much difference for simple example like this, but for complicated logic, I prefer to use drop() when deleting rows because it is more straightforward than using inverse logic. For example, delete rows where A=1 AND (B=2 OR C=3).
Here's a scalable syntax that is easy to understand and can handle complicated logic:
df.drop( df.query(" `line_race` == 0 ").index)
You can try using this:
df.drop(df[df.line_race != 0].index, inplace = True)
.

When does URL to .csv open or download file?

I'm learning some Python pandas and the course uses https://gist.githubusercontent.com/sh7ata/e075ff35b51ebb0d2d577fbe1d19ebc9/raw/b966d02c7c26bcca60703acb1390e938a65a35cb/drinks.csv
Clicking this link opens the actual .csv file contents in my browser and I can read the data into pandas straight away.
However, this doesn't work for https://www.spss-tutorials.com/downloads/browsers.csv. If I click this link, Google Chrome downloads the file rather than show its contents.
Why is this and what can I do about it? I mean, they're both URLs for .csv files, right?
You can use requests module with custom HTTP header to download it. For example:
import requests
import pandas as pd
from io import StringIO
url = "https://www.spss-tutorials.com/downloads/browsers.csv"
headers = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0"
}
req = requests.get(url, headers=headers)
df = pd.read_csv(StringIO(req.text))
print(df.to_markdown())
Prints:
screen_resolution
sessions
perc_new_sessions
new_users
bounce_rate
pages_session
avg_session_duration
goal_conversion_rate
goal_completions
goal_value
0
1366x768
2,284
79.60%
1,818
69.40%
1.93
00:02:14
0.00%
0
€0.00
1
1920x1080
2,013
72.28%
1,455
71.93%
2.02
00:02:18
0.00%
0
€0.00
2
1280x1024
1,217
72.14%
878
74.53%
1.9
00:02:05
0.00%
0
€0.00
3
1680x1050
1,052
68.16%
717
74.62%
1.93
00:01:46
0.00%
0
€0.00
4
1440x900
921
77.85%
717
74.05%
1.73
00:01:45
0.00%
0
€0.00
5
1280x800
865
80.00%
692
71.91%
1.76
00:01:37
0.00%
0
€0.00
6
1600x900
737
76.39%
563
72.86%
1.8
00:02:02
0.00%
0
€0.00
7
1920x1200
441
64.85%
286
73.92%
1.87
00:01:55
0.00%
0
€0.00
8
1024x768
192
88.02%
169
73.96%
2.07
00:01:32
0.00%
0
€0.00
9
2560x1440
137
67.15%
92
61.31%
1.86
00:02:02
0.00%
0
€0.00
10
1280x720
134
82.84%
111
66.42%
2.16
00:01:15
0.00%
0
€0.00
11
1536x864
118
78.81%
93
72.03%
1.78
00:01:45
0.00%
0
€0.00
12
320x568
104
84.62%
88
75.00%
1.89
00:01:18
0.00%
0
€0.00
13
768x1024
91
83.52%
76
67.03%
2.66
00:02:11
0.00%
0
€0.00
14
1360x768
70
77.14%
54
74.29%
1.69
00:01:08
0.00%
0
€0.00
15
360x640
70
71.43%
50
77.14%
2.06
00:02:06
0.00%
0
€0.00
16
1600x1200
62
80.65%
50
82.26%
1.32
00:01:22
0.00%
0
€0.00
17
1344x840
56
44.64%
25
53.57%
3.11
00:04:39
0.00%
0
€0.00
18
320x480
51
80.39%
41
72.55%
1.61
00:00:55
0.00%
0
€0.00
19
1093x614
41
80.49%
33
78.05%
1.76
00:01:42
0.00%
0
€0.00
20
1280x768
38
60.53%
23
68.42%
2.63
00:02:41
0.00%
0
€0.00
21
1024x600
35
94.29%
33
85.71%
1.37
00:01:23
0.00%
0
€0.00
...and so on

Jedis benchmarking on local Redis server

I'm using JMH to test the performance of Jedis on a local Redis server (Jedis version 2.9.0, Redis version 6.2.6, CPU Quad-Core Intel Core i5). I use 200 threads to send SET command within a connection pool.
#State(Scope.Benchmark)
public class CommonClientBenchmark {
private JedisPool jedisPool;
private final String host = "127.0.0.1";
private final int port = 6379;
#Setup
public void setup() {
JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
jedisPoolConfig.setMaxTotal(200);
jedisPoolConfig.setMaxIdle(200);
jedisPool = new JedisPool(jedisPoolConfig, host, port, 30000);
}
#TearDown
public void tearDown() {
jedisPool.close();
}
#Threads(200)
#Fork(1)
#Benchmark
#BenchmarkMode(Mode.Throughput)
#Warmup(iterations = 1, time = 30, timeUnit = TimeUnit.SECONDS)
#Measurement(iterations = 2, time = 30, timeUnit = TimeUnit.SECONDS)
public void jedisSet() {
try (Jedis jedis = jedisPool.getResource()) {
jedis.set("jedis", "jedis");
}
}
public static void main(String[] args) throws IOException, RunnerException {
CommonClientBenchmark commonClientBenchmark = new CommonClientBenchmark();
commonClientBenchmark.setup();
org.openjdk.jmh.Main.main(args);
}
}
With the code above, I obtain about 25000+ QPS. However, when I decrease the maxTotal and maxIdle parameter of the connection pool from 200 to 100, the result QPS is even much higher - it reaches about 75000. Could anyone explain the phenomenon? Thanks a lot!
EDIT: I've change the version of Jedis to 4.1.1 and run multiple benchmarking tests, the result is similar. When the size of connection pool is set to 100 (both maxTotal and maxIdle), I obtain about 25000 ~ 50000 QPS. When I increase the size (both maxTotal and maxIdle) to 200, the QPS rise to 60000 ~ 75000.
I've also use iostat 1 to monitor the usage of CPU while running the tests. And I found that when the pool size is set to 200, the %system is often much higher than when it is set to 100.
connection pool size set to 200:
disk0 cpu load average
KB/t tps MB/s us sy id 1m 5m 15m
4.09 929 3.71 7 87 6 39.58 14.44 8.06
4.00 902 3.52 6 89 5 39.58 14.44 8.06
4.50 8 0.04 5 88 6 38.33 14.60 8.15
4.39 145 0.62 6 89 6 38.33 14.60 8.15
28.00 11 0.30 6 88 5 38.33 14.60 8.15
8.00 1 0.01 5 88 6 38.33 14.60 8.15
0.00 0 0.00 5 88 7 38.33 14.60 8.15
4.00 5 0.02 5 88 7 38.94 15.12 8.37
0.00 0 0.00 5 89 6 38.94 15.12 8.37
0.00 0 0.00 5 88 7 38.94 15.12 8.37
0.00 0 0.00 5 89 6 38.94 15.12 8.37
8.68 222 1.88 5 88 7 38.94 15.12 8.37
5.60 10 0.05 5 87 8 45.20 16.81 9.01
29.65 46 1.33 11 82 7 45.20 16.81 9.01
52.57 7 0.36 8 85 7 45.20 16.81 9.01
28.00 2 0.05 5 87 8 45.20 16.81 9.01
223.33 6 1.31 6 87 7 45.20 16.81 9.01
4.19 1344 5.49 8 85 7 44.54 17.15 9.17
4.61 952 4.29 6 89 5 44.54 17.15 9.17
4.00 690 2.69 6 89 5 44.54 17.15 9.17
connection pool size set to 100:
disk0 cpu load average
KB/t tps MB/s us sy id 1m 5m 15m
4.31 13 0.05 16 59 26 6.55 7.86 7.49
750.67 3 2.20 30 53 17 6.58 7.85 7.48
9.14 225 2.01 23 54 23 6.58 7.85 7.48
37.00 8 0.29 23 56 21 6.58 7.85 7.48
32.00 6 0.19 18 55 26 6.58 7.85 7.48
145.20 10 1.41 22 56 22 6.58 7.85 7.48
0.00 0 0.00 22 56 22 6.46 7.80 7.47
4.00 2660 10.39 24 58 18 6.46 7.80 7.47
4.00 1952 7.62 19 56 25 6.46 7.80 7.47
4.00 1 0.00 19 56 24 6.46 7.80 7.47
4.00 5 0.02 18 56 27 6.46 7.80 7.47
0.00 0 0.00 15 57 28 6.10 7.71 7.44
0.00 0 0.00 18 57 25 6.10 7.71 7.44
256.00 10 2.50 18 56 25 6.10 7.71 7.44
6.29 7 0.04 20 57 23 6.10 7.71 7.44
4.00 5 0.02 20 56 24 6.10 7.71 7.44
17.71 7 0.12 20 56 24 6.01 7.66 7.42
23.00 4 0.09 20 58 23 6.01 7.66 7.42
5.00 4 0.02 23 55 22 6.01 7.66 7.42
4.00 1 0.00 20 56 24 6.01 7.66 7.42

Using beautifulsoup to extract information within pre, turning it into a table to csv

I am trying to extract the text from the below site shown within the code.
While I can print the list fine, I can't seem to turn it into a pandas dataframe, and print it out as a csv.
This is a site that only has the pre info.
Please let me know if there is way to do this.
import requests
from bs4 import BeautifulSoup
#url list for the new stations
url1="https://www.kyoshin.bosai.go.jp/cgi-bin/kyoshin/db/sitedat.cgi?1+NIG010+knet"
tt1="C:/temp/"
page = requests.get(url1)
soup = BeautifulSoup(page.content, 'html.parser')
print(soup)
N-Value P,S-Velocity Density Soil Column
(m/s) (g/cm^3)
----------------------------------------------------------------------------------
1m 13 1351 93 1.43 0m - 1m Fl
2m 9 1351 105 1.77 1m - 7.75m S
3m 11 1389 102 1.86 7.75m - 15.15m S
4m 7 1408 104 1.83 15.15m - 16.75m S
5m 20 1429 120 1.74 16.75m - 19.3m SF
6m 20 1481 121 1.89 19.3m - 22.75m SF
7m 24 1538 143 1.97 22.75m - 25.7m M
8m 53 1550 189 1.87 25.7m - 33.44m S
9m 52 1550 233 1.85
10m 47 1504 222 1.93
11m 43 1493 206 1.9
12m 38 1504 222 1.89
13m 27 1492 213 1.84
14m 44 1492 213 1.9
15m 62 1527 235 1.89
16m 46 1504 189 1.92
17m 22 1481 165 1.87
18m 26 1471 147 1.86
19m 24 1493 202 1.82
20m 21 1493 198 1.87
Not the most robust, but you can iterate line by line to parse the data you need:
import requests
import pandas as pd
from io import StringIO
#url list for the new stations
url1="https://www.kyoshin.bosai.go.jp/cgi-bin/kyoshin/db/sitedat.cgi?1+NIG010+knet"
tt1="C:/temp/"
page = requests.get(url1)
s=str(page.content,'utf-8')
df = pd.DataFrame()
for lineNum, line in enumerate(s.splitlines()):
if lineNum == 0:
headers = line.split()
elif lineNum == 1:
unit1, unit2 = line.split()
elif lineNum == 2:
continue
else:
row = line.split()
idx = row[0]
nval = row[1]
ps = row[2] + ' ' + row[3]
den = row[4]
try:
soil = '%s %s %s' %(row[5], row[6], row[7])
col = row[8]
except Exception as e:
print(e)
soil = ''
col = ''
temp_df = pd.DataFrame([[idx, nval, ps, den, soil, col]],
columns = ['Index',headers[0], headers[1] + ' ' + unit1, headers[2] + ' ' + unit2, headers[3], headers[4]])
df = df.append(temp_df, sort=False).reset_index(drop=True)
df.to_csv('file.csv',index=False)
Output:
print (df)
Index N-Value P,S-Velocity (m/s) Density (g/cm^3) Soil Column
0 1m 13 1351 93 1.43 0m - 1m Fl
1 2m 9 1351 105 1.77 1m - 7.75m S
2 3m 11 1389 102 1.86 7.75m - 15.15m S
3 4m 7 1408 104 1.83 15.15m - 16.75m S
4 5m 20 1429 120 1.74 16.75m - 19.3m SF
5 6m 20 1481 121 1.89 19.3m - 22.75m SF
6 7m 24 1538 143 1.97 22.75m - 25.7m M
7 8m 53 1550 189 1.87 25.7m - 33.44m S
8 9m 52 1550 233 1.85
9 10m 47 1504 222 1.93
10 11m 43 1493 206 1.9
11 12m 38 1504 222 1.89
12 13m 27 1492 213 1.84
13 14m 44 1492 213 1.9
14 15m 62 1527 235 1.89
15 16m 46 1504 189 1.92
16 17m 22 1481 165 1.87
17 18m 26 1471 147 1.86
18 19m 24 1493 202 1.82
19 20m 21 1493 198 1.87

Syntax error with AMPL

I get syntax error while running this script in AMPL. Can someone help me to solve this?
param K;
param N;
param PT;
param beta_lower{1..K};
param beta_upper{1..K};
set KSET := {1 . . K};
set NSET := {1 . . N};
param Channel {KSET,NSET};
var V
var C {KSET, NSET} binary;
#==================================
data;
param K:=2;
param N:=64;
param PT:= 1;
param beta_lower:= 1 1.99 2 3.99;
param beta_upper:= 1 2.01 2 4.01;
param Channel : 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 :=
1 1366 1474 1583 1690 1790 1881 1963 2036 2101 2161 2217 2268 2315 2355 2385 2402 2403 2386 2350 2295 2223 2137 2041 1939 1835 1734 1639 1553 1479 1419 1375 1347 1335 1339 1357 1386 1421 1459 1494 1523 1542 1548 1540 1520 1490 1451 1409 1364 1321 1279 1239 1201 1164 1127 1092 1060 1034 1016 1012 1024 1055 1107 1178 1265
2 1297 1281 1250 1201 1135 1055 963 867 772 685 611 555 519 504 510 536 579 636 702 775 851 928 1002 1074 1143 1209 1276 1345 1420 1503 1596 1698 1808 1921 2033 2137 2225 2290 2327 2333 2309 2256 2180 2089 1989 1890 1796 1712 1641 1582 1533 1493 1458 1425 1393 1364 1337 1314 1298 1289 1288 1292 1297 1301;
I write this piece of code in tex file (.rtf) and upload this to neos-server
The output from the solver is:
amplin, line 7 (offset 54):
syntax error
context: >>> {\ <<< rtf1\ansi\ansicpg1252\cocoartf12processing commands.
Executing on neos-2.neos-server.org
Error (2) in /opt/ampl/ampl -R amplin
The problem is that you mix syntax for AMPL model and data in your code. In particular, you should first declare parameter beta_lower in the model:
param beta_lower{1..K}; # you may want a different indexing expression here
and then provide data for it in the data section:
data;
param beta_lower:= 1 1.99 2 3.99;
Your updated formulation looks correct, but the error indicates that it is in RTF format. To make it work you should convert it into plain text.