Ggpredict over fitting causing trend line to begin in an odd place - ggplot2

I am not entirely sure what I am doing wrong/what to look up. My objective is to use ggpredict and ggplot to display the relationship between time and the proportion of years burnt. I'm guessing it is something to do with the time variable being log transformed?
# Load modelling (lme4), plotting (ggplot2), prediction (ggeffects),
# and data-manipulation (dplyr) packages -- one per line for readability.
library(lme4)
library(ggplot2)
library(ggeffects)
library(dplyr)
# Read the fire-history dataset (10 km circles).
data <- read.csv('FtmpAllyrs10kmC.csv')
This is what the data looks like:
structure(list(Observ = c(5208, 2828, 1664, 578, 18, 1644, 4741,
751, 689, 3813, 1464, 438, 1553, 4752, 4960, 376, 2482, 1811,
5682, 5441, 4505, 2281, 2103, 2993, 562, 4297, 3592, 5148, 3793,
1621, 1912, 1627, 1737, 4976, 2173, 5132, 5758, 2756, 1789, 5666,
2628, 2593, 794, 5779, 5158, 3123, 4986, 676, 4200, 2442, 2751,
4330, 1802, 2020, 2500, 1056, 959, 3290, 4303, 247, 5586, 922,
1049, 2432, 2076, 2560, 1369, 3636, 3722, 4137, 1561, 4915, 2515,
3034, 5547, 1491, 1247, 4116, 455, 4687, 1697, 5329, 21, 5724,
3701, 5697, 2938, 1721, 61, 998, 4304, 5798, 651, 910, 2689,
3986, 2908, 5753, 2574, 2345, 1940, 4317, 4588, 2179, 665, 4133,
749, 3977, 3134, 4190, 3985, 4937, 2473, 3238, 4987, 3915, 4261,
3521, 2736, 3665, 1797, 5692, 5578, 4087, 2011, 903, 889, 1523,
3396, 2291, 5269, 3644, 3403, 4814, 4618, 16, 77, 5385, 2842,
5816, 2015, 1443, 3183, 3331, 4977, 5380, 989, 4918, 740, 4637,
887, 1557, 4295, 4673, 1918, 5662, 4167, 1384, 3441, 614, 2360,
780, 661, 1267, 2018, 1906, 3402, 677, 5218, 2830, 4979, 3984,
4924, 1125, 2640, 986, 1885, 2573, 5300, 2398, 4832, 4816, 3738,
3276, 3830, 2425, 2054, 4273, 5607, 1678, 378, 1158, 510, 2210,
2399, 1952, 2909, 4945, 2659, 2642), yrblock15 = c(2015, 2010,
2007, 2005, 2004, 2007, 2014, 2005, 2005, 2012, 2007, 2004, 2007,
2014, 2015, 2004, 2009, 2008, 2016, 2016, 2014, 2009, 2008, 2010,
2005, 2013, 2011, 2015, 2012, 2007, 2008, 2007, 2007, 2015, 2008,
2015, 2016, 2010, 2007, 2016, 2009, 2009, 2005, 2016, 2015, 2010,
2015, 2005, 2013, 2009, 2010, 2013, 2008, 2008, 2009, 2006, 2006,
2011, 2013, 2004, 2016, 2006, 2006, 2009, 2008, 2009, 2007, 2012,
2012, 2013, 2007, 2014, 2009, 2010, 2016, 2007, 2006, 2013, 2005,
2014, 2007, 2015, 2004, 2016, 2012, 2016, 2010, 2007, 2004, 2006,
2013, 2016, 2005, 2006, 2009, 2012, 2010, 2016, 2009, 2009, 2008,
2013, 2014, 2008, 2005, 2013, 2005, 2012, 2010, 2013, 2012, 2014,
2009, 2011, 2015, 2012, 2013, 2011, 2010, 2012, 2007, 2016, 2016,
2013, 2008, 2006, 2005, 2007, 2011, 2009, 2015, 2012, 2011, 2014,
2014, 2004, 2004, 2015, 2010, 2016, 2008, 2007, 2011, 2011, 2015,
2015, 2006, 2014, 2005, 2014, 2005, 2007, 2013, 2014, 2008, 2016,
2013, 2007, 2011, 2005, 2009, 2005, 2005, 2006, 2008, 2008, 2011,
2005, 2015, 2010, 2015, 2012, 2014, 2006, 2009, 2006, 2008, 2009,
2015, 2009, 2014, 2014, 2012, 2011, 2012, 2009, 2008, 2013, 2016,
2007, 2004, 2006, 2005, 2008, 2009, 2008, 2010, 2014, 2009, 2009
), circleID = c(258, 128, 314, 128, 18, 294, 241, 301, 239, 213,
114, 438, 203, 252, 10, 376, 232, 11, 282, 41, 5, 31, 303, 293,
112, 247, 442, 198, 193, 271, 112, 277, 387, 26, 373, 182, 358,
56, 439, 266, 378, 343, 344, 379, 208, 423, 36, 226, 150, 192,
51, 280, 2, 220, 250, 156, 59, 140, 253, 247, 186, 22, 149, 182,
276, 310, 19, 36, 122, 87, 211, 415, 265, 334, 147, 141, 347,
66, 5, 187, 347, 379, 21, 324, 101, 297, 238, 371, 61, 98, 254,
398, 201, 10, 439, 386, 208, 353, 324, 95, 140, 267, 88, 379,
215, 83, 299, 377, 434, 140, 385, 437, 223, 88, 37, 315, 211,
371, 36, 65, 447, 292, 178, 37, 211, 3, 439, 173, 246, 41, 319,
44, 253, 314, 118, 16, 77, 435, 142, 416, 215, 93, 33, 181, 27,
430, 89, 418, 290, 137, 437, 207, 245, 173, 118, 262, 117, 34,
291, 164, 110, 330, 211, 367, 218, 106, 252, 227, 268, 130, 29,
384, 424, 225, 390, 86, 85, 323, 350, 148, 332, 316, 138, 126,
230, 175, 254, 223, 207, 328, 378, 258, 60, 410, 149, 152, 209,
445, 409, 392), rain15 = c(347.83, 394.12, 382.2, 382.41, 395.7,
386.08, 383.79, 352.65, 354.31, 366.48, 416.79, 335.17, 409.24,
373, 390.76, 341.35, 387.25, 452.18, 329.14, 365.74, 432.58,
443.36, 375.57, 359.75, 379.14, 386.41, 361.47, 366.1, 382.57,
383.32, 409.56, 390.92, 380.38, 394.94, 366.72, 347.44, 336.88,
410.94, 370.83, 335.88, 368.53, 370.42, 344.56, 323.41, 348.34,
351.07, 382.75, 362.64, 402.7, 396.11, 418.01, 389.14, 462.76,
391.05, 369.47, 399.78, 419.32, 392.97, 389.15, 345.37, 336.22,
405.73, 378.45, 394.7, 388.29, 379.56, 437.29, 415.95, 388.91,
402.43, 397.09, 368.84, 378.54, 361.92, 355.22, 416.46, 361.24,
417.12, 420.92, 386.48, 375.04, 335.03, 385.23, 342.51, 401.27,
341.21, 362.81, 372.85, 396.48, 390.72, 385.06, 343.64, 365.25,
440.76, 364.68, 354.45, 368.7, 324.44, 366.4, 408.43, 405.71,
390.8, 401.09, 364.07, 360.68, 399.39, 348.38, 344.2, 345.23,
401.29, 356.48, 364.21, 376.12, 403.37, 384.1, 355.71, 389.53,
363.28, 417.76, 403.16, 362.28, 333.91, 337.46, 419.51, 389.22,
448.08, 338.46, 397.52, 372.25, 424.25, 349.25, 408.19, 376.68,
375.87, 403.78, 398.73, 386.92, 340.39, 391.58, 335.03, 390.25,
422.05, 423.79, 386.49, 392.97, 334.07, 403.85, 369.54, 348.84,
392.33, 336.68, 399.56, 386.84, 395.97, 409.93, 337.08, 410.27,
450.48, 364.93, 369.08, 413.31, 341.93, 360.06, 362.28, 395.8,
423.56, 376.67, 366.19, 358.88, 390.74, 390.84, 362.84, 370.21,
360.84, 371.9, 410.36, 421.59, 367.48, 355.62, 389.61, 370.81,
374.37, 382.61, 401.78, 373.7, 382.72, 387.56, 388.53, 329.06,
383.78, 336.97, 376.68, 398.57, 370.46, 388.88, 421.66, 369.29,
371.58, 369.01, 369.22), YearsBurnt = c(6, 6, 3.5, 5, 3, 2, 3.5,
2.5, 2, 1.5, 10.5, 3.5, 2.5, 3.5, 4.5, 3, 2, 2.5, 1.5, 3.5, 3.5,
4, 4, 3, 3.5, 2.5, 6, 4.5, 4, 2.5, 3.5, 2, 7, 3, 2.5, 3.5, 13,
3, 3.5, 3.5, 4.5, 3, 1.5, 2, 4, 2, 4.5, 4, 3.5, 2.5, 2, 2, 3,
1, 5, 2.5, 4, 12.5, 2.5, 1.5, 3.5, 1.5, 2.5, 4, 4.5, 10, 3, 3.5,
4.5, 10.5, 1, 4.5, 2, 13.5, 8.5, 10, 1, 4, 3, 3.5, 1.5, 3, 2.5,
2.5, 2.5, 4.5, 4, 1.5, 3, 3.5, 4.5, 1.5, 3, 2.5, 3.5, 8.5, 4,
7, 2.5, 5, 11, 3.5, 11.5, 3, 1.5, 3, 0.5, 4.5, 3.5, 13.5, 7.5,
3.5, 2, 12, 4, 5, 2, 1.5, 3.5, 4.5, 2, 3.5, 3, 4, 1.5, 2, 2.5,
6, 2, 5, 3.5, 4.5, 2, 3.5, 5, 4.5, 3, 4, 14, 3, 1.5, 3.5, 5.5,
3, 4, 3, 7, 4.5, 2.5, 3, 3, 3.5, 3, 9, 5, 6.5, 5, 4, 4, 3.5,
3, 8.5, 1, 4.5, 1.5, 5.5, 3, 2, 2.5, 2.5, 3, 8.5, 2.5, 1, 3.5,
5.5, 5, 1.5, 2, 4.5, 5, 4, 1.5, 3.5, 4.5, 6, 4.5, 3.5, 3, 6.5,
3, 6.5, 3.5, 4.5, 2.5, 2.5, 4, 4, 4, 4.5), YearsNotBurnt = c(9,
9, 11.5, 10, 12, 13, 11.5, 12.5, 13, 13.5, 4.5, 11.5, 12.5, 11.5,
10.5, 12, 13, 12.5, 13.5, 11.5, 11.5, 11, 11, 12, 11.5, 12.5,
9, 10.5, 11, 12.5, 11.5, 13, 8, 12, 12.5, 11.5, 2, 12, 11.5,
11.5, 10.5, 12, 13.5, 13, 11, 13, 10.5, 11, 11.5, 12.5, 13, 13,
12, 14, 10, 12.5, 11, 2.5, 12.5, 13.5, 11.5, 13.5, 12.5, 11,
10.5, 5, 12, 11.5, 10.5, 4.5, 14, 10.5, 13, 1.5, 6.5, 5, 14,
11, 12, 11.5, 13.5, 12, 12.5, 12.5, 12.5, 10.5, 11, 13.5, 12,
11.5, 10.5, 13.5, 12, 12.5, 11.5, 6.5, 11, 8, 12.5, 10, 4, 11.5,
3.5, 12, 13.5, 12, 14.5, 10.5, 11.5, 1.5, 7.5, 11.5, 13, 3, 11,
10, 13, 13.5, 11.5, 10.5, 13, 11.5, 12, 11, 13.5, 13, 12.5, 9,
13, 10, 11.5, 10.5, 13, 11.5, 10, 10.5, 12, 11, 1, 12, 13.5,
11.5, 9.5, 12, 11, 12, 8, 10.5, 12.5, 12, 12, 11.5, 12, 6, 10,
8.5, 10, 11, 11, 11.5, 12, 6.5, 14, 10.5, 13.5, 9.5, 12, 13,
12.5, 12.5, 12, 6.5, 12.5, 14, 11.5, 9.5, 10, 13.5, 13, 10.5,
10, 11, 13.5, 11.5, 10.5, 9, 10.5, 11.5, 12, 8.5, 12, 8.5, 11.5,
10.5, 12.5, 12.5, 11, 11, 11, 10.5), time = c(1.96, 4.94, 3.46,
4.94, 2.73, 6.22, 4.5, 2.67, 4.66, 3.83, 0.38, 2.6, 3.97, 4.18,
3.77, 3.44, 2.9, 3.93, 2.16, 3.51, 2.91, 3.19, 2.73, 6.36, 1.74,
4.39, 4.1, 2.26, 2.36, 5.32, 1.74, 3.66, 1.26, 5.61, 9.04, 4.61,
0.46, 3.98, 2.63, 5.5, 2.56, 5.92, 6.39, 2.26, 3.27, 7.95, 2.93,
4.93, 2.97, 2.43, 5.91, 3.07, 4.27, 3.21, 4.12, 4.72, 1.93, 0.69,
3.51, 4.39, 4.02, 3.18, 2.61, 4.61, 3.67, 0.54, 2.33, 2.93, 2.12,
1.06, 3.95, 2.31, 5.44, 0.17, 1.42, 0.55, 8.35, 2.53, 2.91, 3.26,
8.35, 2.26, 2.23, 7.18, 6.59, 6.36, 4.38, 7.67, 1.93, 3.34, 2.91,
8.54, 5.75, 3.77, 2.63, 0.97, 3.27, 1.58, 7.18, 2.08, 0.69, 5.43,
0.85, 2.26, 3.69, 3.18, 6.18, 2.93, 2.68, 0.69, 0.92, 2.34, 3.26,
0.85, 2.91, 4.3, 3.95, 7.67, 2.93, 2.1, 6.54, 6.31, 3.87, 2.91,
3.95, 3.35, 2.63, 1.49, 4.32, 3.51, 7.06, 2.67, 3.51, 3.46, 1.56,
4.33, 5.64, 2.73, 0.57, 2.87, 3.69, 2.56, 2.33, 4.27, 4.73, 4.02,
0.82, 4.11, 4.88, 2.29, 2.34, 3.72, 4.21, 1.49, 1.56, 3.03, 1.24,
2.65, 5.71, 1.67, 2.71, 1.49, 3.95, 4.51, 3.36, 5.21, 4.18, 4.54,
5.36, 4.25, 3.71, 0.95, 8.92, 3.12, 2.73, 1.36, 1.85, 7.24, 8.11,
2.2, 0.95, 5.16, 1.3, 6.54, 3.01, 1.97, 2.91, 3.26, 3.72, 1.79,
2.56, 1.96, 1.89, 1.89, 2.61, 5.25, 3.25, 5.26, 1.74, 3.73),
claylake = c(0, 0, 0, 0, 0, 17.53, 0.1, 0.59, 0, 9.13, 36.93,
12.75, 0, 0, 0, 0, 0, 0, 0, 0.09, 0.01, 0, 0, 9.43, 74.71,
26.42, 0.23, 0, 0, 35.27, 74.71, 0, 0, 0, 0, 0, 0, 0, 20.81,
9.46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1.14, 0, 26.42, 3.62, 0, 0, 0, 0.21, 0, 0, 0, 0.03, 10.43,
0.99, 3.6, 5.32, 0, 0.36, 0, 0, 0.25, 0.01, 0.22, 0, 0, 6.45,
0, 0, 0, 0, 0, 1.71, 0, 0, 0, 0, 0, 20.81, 0, 0, 0.18, 0,
0, 1.14, 0.03, 1.2, 0, 8.97, 0, 0, 0, 0, 1.14, 0, 1.56, 0.22,
1.2, 0, 0, 0.99, 0, 0, 0, 0, 4.14, 0, 0, 0.99, 0, 20.81,
0, 33.61, 0.09, 14.94, 0, 0, 0, 0, 0.41, 0, 2.7, 0, 0.61,
8.97, 0, 0, 0, 0, 1.7, 2.67, 7.71, 0.2, 8.63, 1.56, 0, 0.49,
0, 0, 0, 0, 0, 11.9, 33.08, 0, 0, 0.99, 2.13, 0, 0, 0, 0,
0.03, 0, 0, 0, 0, 0, 0, 2.86, 1.65, 0, 0, 0, 0, 0, 60.14,
0, 0, 0, 0, 0.22, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0, 5.57
), spinsandplain = c(81.94, 34.29, 89.55, 34.29, 80.86, 75.92,
81.55, 43.53, 97.3, 87.84, 60.62, 80.81, 11.73, 5.11, 98.67,
79.52, 60.73, 91.65, 2.82, 97.31, 73.65, 72.78, 96.51, 74.02,
25.09, 50.74, 96.62, 88.77, 98.8, 54.04, 25.09, 95.1, 69.85,
99.4, 78.79, 78.77, 48.16, 80.68, 75.79, 66.33, 68.3, 79.11,
91.89, 82.49, 98.33, 90.82, 91.24, 65.01, 69.24, 99.94, 99.75,
18.57, 90.39, 95.56, 71.07, 67.85, 92.37, 85.85, 17.89, 50.74,
79.65, 68.82, 74.05, 78.77, 87.67, 41.11, 91.74, 91.24, 44.8,
86.24, 97.7, 94.17, 85.59, 33.53, 85.23, 94.55, 78.52, 95.49,
73.65, 95.04, 78.52, 82.49, 77.26, 83.4, 98.29, 85.24, 98.78,
87.09, 81.36, 96.62, 3.4, 94.65, 28.6, 98.67, 75.79, 73.34,
98.33, 74.88, 83.4, 88.24, 85.85, 52.44, 95.84, 82.49, 62.11,
98.74, 70.32, 86.18, 95.67, 85.85, 11.42, 85.96, 75.53, 95.84,
95.46, 93.68, 97.7, 87.09, 91.24, 80.03, 87.77, 68.71, 17.51,
95.46, 97.7, 50.7, 75.79, 70.43, 61.06, 97.31, 74.63, 99,
17.89, 89.55, 99.25, 98.08, 97.61, 93.36, 99.03, 38.1, 62.11,
96.9, 88.87, 40.48, 90.21, 73.79, 95.2, 66.53, 96.67, 82.89,
85.96, 97.08, 75.74, 70.43, 99.25, 96.4, 98.88, 98.13, 85.32,
54.19, 99.2, 81.42, 97.7, 82.25, 97.42, 98.1, 5.11, 12.06,
66.14, 52.39, 52.72, 12.32, 87.32, 98.95, 71.55, 90.58, 97.9,
80.62, 93.32, 76, 86.48, 86.42, 39.54, 68.65, 6.05, 86.02,
3.4, 75.53, 97.08, 32.47, 68.3, 81.94, 89.64, 57.4, 74.05,
0.47, 96.76, 86.7, 78.46, 84.81)), row.names = c(5208L, 2828L,
1664L, 578L, 18L, 1644L, 4741L, 751L, 689L, 3813L, 1464L, 438L,
1553L, 4752L, 4960L, 376L, 2482L, 1811L, 5682L, 5441L, 4505L,
2281L, 2103L, 2993L, 562L, 4297L, 3592L, 5148L, 3793L, 1621L,
1912L, 1627L, 1737L, 4976L, 2173L, 5132L, 5758L, 2756L, 1789L,
5666L, 2628L, 2593L, 794L, 5779L, 5158L, 3123L, 4986L, 676L,
4200L, 2442L, 2751L, 4330L, 1802L, 2020L, 2500L, 1056L, 959L,
3290L, 4303L, 247L, 5586L, 922L, 1049L, 2432L, 2076L, 2560L,
1369L, 3636L, 3722L, 4137L, 1561L, 4915L, 2515L, 3034L, 5547L,
1491L, 1247L, 4116L, 455L, 4687L, 1697L, 5329L, 21L, 5724L, 3701L,
5697L, 2938L, 1721L, 61L, 998L, 4304L, 5798L, 651L, 910L, 2689L,
3986L, 2908L, 5753L, 2574L, 2345L, 1940L, 4317L, 4588L, 2179L,
665L, 4133L, 749L, 3977L, 3134L, 4190L, 3985L, 4937L, 2473L,
3238L, 4987L, 3915L, 4261L, 3521L, 2736L, 3665L, 1797L, 5692L,
5578L, 4087L, 2011L, 903L, 889L, 1523L, 3396L, 2291L, 5269L,
3644L, 3403L, 4814L, 4618L, 16L, 77L, 5385L, 2842L, 5816L, 2015L,
1443L, 3183L, 3331L, 4977L, 5380L, 989L, 4918L, 740L, 4637L,
887L, 1557L, 4295L, 4673L, 1918L, 5662L, 4167L, 1384L, 3441L,
614L, 2360L, 780L, 661L, 1267L, 2018L, 1906L, 3402L, 677L, 5218L,
2830L, 4979L, 3984L, 4924L, 1125L, 2640L, 986L, 1885L, 2573L,
5300L, 2398L, 4832L, 4816L, 3738L, 3276L, 3830L, 2425L, 2054L,
4273L, 5607L, 1678L, 378L, 1158L, 510L, 2210L, 2399L, 1952L,
2909L, 4945L, 2659L, 2642L), class = "data.frame")
I create a new variable as the proportion of years burnt is out of 15 years (i.e., binomial)
# Two-column response matrix (successes = YearsBurnt, failures = YearsNotBurnt)
# suitable for a binomial GLM(M) response out of 15 years.
data$fireprop <- cbind(data$YearsBurnt, data$YearsNotBurnt)
Model:
# Binomial GLMM: proportion of years burnt modelled on log(time), sand-plain
# and clay-lake cover, rainfall, and a rain15 x log(time) interaction, with a
# random intercept per sampling circle. NOTE(review): because log(time) appears
# in the formula, ggpredict() will back-transform the time axis automatically
# (back.transform = TRUE by default) -- see the discussion below.
mfireprop = glmer(fireprop~log(time)+spinsandplain+rain15+claylake+rain15*log(time)+(1|circleID),na.action=na.fail, family=binomial, data=data)
Trend line code:
# Asker's original trend-line code. NOTE(review): "time[exp]" exponentiates
# the axis values, but ggpredict already back-transforms log(time) by default,
# so the values end up exponentiated twice (this is the bug explained below).
d = ggpredict(mfireprop, terms = "time[exp]")
# Rename ggpredict's generic x/predicted columns to match the model variables.
d = rename(d, "time" = x, "fireprop" = predicted)
ggplot(d, aes(time, fireprop)) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .1) +
geom_line(size = 2, colour = "black") +
theme_bw()
And the trend line comes out looking like this:
Why is the x axis not stopping at 10 hours where the data stops? Why is it going to 20,000? And why does the y axis only go to 0.4? When some of the proportions are 1?
When I limit the x and y axis it ends up looking like this:
But when I look at the raw data over the top of that, it seems like the trend line is starting off in a really odd place.
I am unsure of what I am doing wrong?

Okay, so I've figured out the main problem here. In the documentation of the ggpredict() function there is an argument called back.transform that defaults to TRUE. This means that log-transformed data will be automatically transformed back to the original response scale. This is why if you examine the ggpredict object d, you will see that the time variable actually does go to over 8000 in that object. So because you did not flag back.transform=FALSE, but also specified time[exp], what happened was the function automatically exponentiated your values, and then you did it again.
If we look at the logged values:
summary(log(data$time))
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.7720 0.8154 1.1802 1.0904 1.4793 2.2017
Then we exponentiate the max value, we get the previous max:
exp(2.2017) # Exponentiated to get back to years
[1] 9.040369
summary(data$time) # The original variable
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.170 2.260 3.255 3.519 4.390 9.040
If we exponentiate it again, we end up with the max time being over 8000.
exp(9.040369)
[1] 8436.89
So, to get the plot you want, you just need to leave out the [exp] after calling time in ggpredict():
# Corrected version: drop "[exp]" -- ggpredict's default back.transform = TRUE
# already returns time on the original (un-logged) scale.
d = ggpredict(mfireprop, terms = "time")
# Rename ggpredict's generic x/predicted columns to match the model variables.
d = rename(d, "time" = x, "fireprop" = predicted)
ggplot(d, aes(time, fireprop)) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .1) +
geom_line(size = 2, colour = "black") +
theme_bw()
The time is being cut off because at time 0 there is no variation. YearsNotBurnt is always 0. Therefore, if you look at the object d from ggpredict, you will see NaN in all the columns for time 0. If you simplify the model to the following:
# Simplified model: log(time) as the only fixed effect, random intercept per
# circle. Fix: the data frame in this script is named `data` (read in above);
# `df` is not defined anywhere in the posted code.
mfireprop2 = glmer(fireprop~
log(time) +
(1|circleID),
na.action=na.fail,
family=binomial,
data=data)
You will be able to get the plot, but because there is very little variation, the confidence interval will span from one to zero. I believe this is an issue related to separation: basically, binomial models can't be fit with frequentist methods if there is no variation, or if something perfectly predicts the outcome.
The only other thing I wanted to mention is that you had a question in the comments about "non-integer counts in a binomial glm!". This warning appears because the binomial family expects the response to be counts of trials, which should not have decimals. You have points in your data that seem to be half-year intervals. I'm not familiar enough with your data to say for sure what a better alternative would be, but creating a proportion and supplying the number of observations in the weights= argument might be an option.

Related

Sorting Pandas dataframe by multiple conditions

I have a large dataframe (thousands of rows by hundreds of columns), a short excerpt is as the following:
# Example frame for the sorting question. The first three rows carry
# header-like metadata; column (key) order is preserved exactly.
data = {
    'Step': ['', '', '', 'First', 'First', 'Second', 'Third', 'Second',
             'First', 'Second', 'First', 'First', 'Second', 'Second'],
    'Stuff': ['tot', 'white', 'random', 7583, 3563, 824, 521, 7658,
              2045, 33, 9823, 5, 8090, 51],
    'Mark': ['marking', '', '', 1, 5, 5, 5, 1, 27, 27, 1, 6, 1, 9],
    'A': ['item_a', 100, 'st1', 142, 2, 2, 2, 100, 150, 105, 118, 118, 162, 156],
    'B': ['skill', 66, 'abc', 160, 2, 130, 140, 169, 1, 2, 130, 140, 144, 127],
    'C': ['item', 50, 'st1', 2000, 2, 65, 2001, 1999, 1, 2, 2000, 4, 2205, 2222],
    'D': ['item_c', 100, 'st1', 433, 430, 150, 170, 130, 1, 2, 300, 4, 291, 606],
    'E': ['test', 90, 'st1', 111, 130, 5, 10, 160, 1, 2, 232, 4, 144, 113],
    'F': ['done', 80, 'abc', 765, 755, 5, 10, 160, 1, 2, 733, 4, 666, 500],
    'G': ['nd', 90, 'mag', 500, 420, 5, 10, 160, 1, 2, 300, 4, 469, 500],
    'H': ['prt', 100, 'st1', 999, 200, 5, 10, 160, 1, 2, 477, 4, 620, 7],
    'Name': ['NS', '', '', "Pat", "Lucy", "Lucy", "Lucy", "Nick", "Kirk",
             "Kirk", "Joe", "Nico", "Nico", "Bryan"],
    'Value': [-1, 0, 0, 0, 3, 6, 5, 0, 7, 7, 0, 6, 0, 1],
}
df = pd.DataFrame(data)
I need to sort this dataframe according to the following conditions that have to be satisfied all together:
In the "Name" column, names that are the same are to remain grouped (e.g. there are 3
records of "Lucy" next to each other, and they cannot be moved apart)
For each group of names, the appearance order has to remain the one
given by the "Step" column (e.g. the first appearance of "Lucy" is
related to the value "First" in the "Step" column, the second to
"Second" and so on)
All the remaining names that in the "Value" column have a value = 0,
have to be moved below the others (e.g. "Pat" can be moved after the
others, but not "Nico" because there are two records of "Nico" and
the other one has a value = 6)
The first three rows cannot be moved
What I have done is to concatenate different sub-dataframes:
# Partition rows by whether their Name is shared with another row, then stack:
# grouped names first (keeping their original relative order), followed by
# singleton names with Value > 0, and finally singleton names with Value == 0.
grouped_mask = df.duplicated(subset=['Name'], keep=False)
df_groupnames = df[grouped_mask]
df_nogroup = df[~grouped_mask]
df_nogroup_high = df_nogroup[df_nogroup["Value"] > 0]
df_nogroup_null = df_nogroup[df_nogroup["Value"] == 0]
# Concatenate the pieces to obtain the sorted frame
df_sorted = pd.concat([df_groupnames, df_nogroup_high, df_nogroup_null])
It works, but I wonder if there's a smarter, simpler way, and maybe faster, to obtain the same.
Thank you for your attention.

Outliers in data

I have a dataset like so -
15643, 14087, 12020, 8402, 7875, 3250, 2688, 2654, 2501, 2482, 1246, 1214, 1171, 1165, 1048, 897, 849, 579, 382, 285, 222, 168, 115, 92, 71, 57, 56, 51, 47, 43, 40, 31, 29, 29, 29, 29, 28, 22, 20, 19, 18, 18, 17, 15, 14, 14, 12, 12, 11, 11, 10, 9, 9, 8, 8, 8, 8, 7, 6, 5, 5, 5, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
Based on domain knowledge, I know that larger values are the only ones we want to include in our analysis. How do I determine where to cut off our analysis? Should it be don't include 15 and lower or 50 and lower etc?
You can do a distribution check with quantile function. Then you can remove values below lowest 1 percentile or 2 percentile. Following is an example:
import numpy as np
# Convert the raw list (given in the question) to an array, then print the
# 1st and 2nd percentiles as candidate lower cut-offs for the analysis.
data = np.array(data)
print(np.quantile(data, (.01, .02)))
Another method is to calculate the inter-quartile range (IQR) and set the lower bound for the analysis at Q1 - 1.5*IQR:
# IQR rule: anything below Q1 - 1.5*IQR is treated as a low outlier,
# so data_floor is the lower bound for values kept in the analysis.
Q1, Q3 = np.quantile(data, (0.25, 0.75))
data_floor = Q1 - 1.5 * (Q3 - Q1)

Increasing the label size in matplotlib in pie chart

I have the following dictionary
{'Electronic Arts': 66,
'GT Interactive': 1,
'Palcom': 1,
'Fox Interactive': 1,
'LucasArts': 5,
'Bethesda Softworks': 9,
'SquareSoft': 3,
'Nintendo': 142,
'Virgin Interactive': 4,
'Atari': 7,
'Ubisoft': 28,
'Konami Digital Entertainment': 11,
'Hasbro Interactive': 1,
'MTV Games': 1,
'Sega': 11,
'Enix Corporation': 4,
'Capcom': 13,
'Warner Bros. Interactive Entertainment': 7,
'Acclaim Entertainment': 1,
'Universal Interactive': 1,
'Namco Bandai Games': 7,
'Eidos Interactive': 9,
'THQ': 7,
'RedOctane': 1,
'Sony Computer Entertainment Europe': 3,
'Take-Two Interactive': 24,
'Square Enix': 5,
'Microsoft Game Studios': 22,
'Disney Interactive Studios': 2,
'Vivendi Games': 2,
'Sony Computer Entertainment': 52,
'Activision': 45,
'505 Games': 4}
Now the problem I am facing is viewing the labels. The labels are extremely small and invisible.
Please anyone can suggest on how to increase the label size.
I have tried the below code:
# Original attempt: enlarging the figure alone does not scale the label text.
plt.figure(figsize=(80,80))
plt.pie(vg_dict.values(),labels=vg_dict.keys())
plt.show()
Adding textprops argument in plt.pie method:
# Fix: pass textprops to plt.pie to set the label font size explicitly.
plt.figure(figsize=(80,80))
plt.pie(vg_dict.values(), labels=vg_dict.keys(), textprops={'fontsize': 30})
plt.show()
You can check all the properties of Text object here.
Updated
I don't know if your labels order matter? To avoid overlapping labels, you can try to modify your start angle (plt start drawing pie counterclockwise from the x-axis), and re-order the "crowded" labels:
# Publisher -> title-count mapping. NOTE: the insertion order here is
# deliberate -- it was re-ordered relative to the question's dict so that
# small "crowded" slices are separated, reducing label overlap in the pie.
vg_dict = {
'Palcom': 1,
'Electronic Arts': 66,
'GT Interactive': 1,
'LucasArts': 5,
'Bethesda Softworks': 9,
'SquareSoft': 3,
'Nintendo': 142,
'Virgin Interactive': 4,
'Atari': 7,
'Ubisoft': 28,
'Hasbro Interactive': 1,
'Konami Digital Entertainment': 11,
'MTV Games': 1,
'Sega': 11,
'Enix Corporation': 4,
'Capcom': 13,
'Acclaim Entertainment': 1,
'Warner Bros. Interactive Entertainment': 7,
'Universal Interactive': 1,
'Namco Bandai Games': 7,
'Eidos Interactive': 9,
'THQ': 7,
'RedOctane': 1,
'Sony Computer Entertainment Europe': 3,
'Take-Two Interactive': 24,
'Vivendi Games': 2,
'Square Enix': 5,
'Microsoft Game Studios': 22,
'Disney Interactive Studios': 2,
'Sony Computer Entertainment': 52,
'Fox Interactive': 1,
'Activision': 45,
'505 Games': 4}
# Large canvas + bigger label font + rotated start angle to spread the labels.
plt.figure(figsize=(80,80))
plt.pie(vg_dict.values(), labels=vg_dict.keys(), textprops={'fontsize': 35}, startangle=-35)
plt.show()
Result:

MultiPoint crossover using Numpy

I am trying to do crossover on a Genetic Algorithm population using numpy.
I have sliced the population using parent 1 and parent 2.
# Random binary GA population: 4 chromosomes x 8 genes.
# Even-indexed rows form parent set 1, odd-indexed rows form parent set 2.
population = np.random.randint(2, size=(4,8))
p1 = population[0::2]
p2 = population[1::2]
But I am not able to figure out any lambda or numpy command to do a multi-point crossover over parents.
The concept is to take ith row of p1 and randomly swap some bits with ith row of p2.
I think you want to select from p1 and p2 at random, cell by cell.
To make it easier to understand i've changed p1 to be 10 to 15 and p2 to be 20 to 25. p1 and p2 were generated at random in these ranges.
p1
Out[66]:
array([[15, 15, 13, 14, 12, 13, 12, 12],
[14, 11, 11, 10, 12, 12, 10, 12],
[12, 11, 14, 15, 14, 10, 13, 10],
[11, 12, 10, 13, 14, 13, 12, 13]])
In [67]: p2
Out[67]:
array([[23, 25, 24, 21, 24, 20, 24, 25],
[21, 21, 20, 20, 25, 22, 24, 22],
[24, 22, 25, 20, 21, 22, 21, 22],
[22, 20, 21, 22, 25, 23, 22, 21]])
In [68]: sieve=np.random.randint(2, size=(4,8))
In [69]: sieve
Out[69]:
array([[0, 1, 0, 1, 1, 0, 1, 0],
[1, 1, 1, 0, 0, 1, 1, 1],
[0, 1, 1, 0, 0, 1, 1, 0],
[0, 0, 0, 1, 1, 1, 1, 1]])
In [70]: not_sieve=sieve^1 # Complement of sieve
In [71]: pn = p1*sieve + p2*not_sieve
In [72]: pn
Out[72]:
array([[23, 15, 24, 14, 12, 20, 12, 25],
[14, 11, 11, 20, 25, 12, 10, 12],
[24, 11, 14, 20, 21, 10, 13, 22],
[22, 20, 21, 13, 14, 13, 12, 13]])
The numbers in the teens come from p1 when sieve is 1
The numbers in the twenties come from p2 when sieve is 0
This may be able to be made more efficient but is this what you expect as output?

a MCMC code in winbugs

i have a problem with winbugs code. i start to learn it recently and now i want to write a code for predicting some variable with mcmc method. the equation is:
R=a1*U + a2*B + a3*D^a4 + a5*S^a6 + a7. I have all values of R, U, D, B, and S, and a1 to a7 are random variables that I want to estimate. This is the code that I wrote, and I know it's not right. Can anyone help me correct it?
# Asker's model: R[i] is defined as a deterministic node (<-), so the data R
# never enter a likelihood -- there is no error/uncertainty term linking the
# observations to the parameters. See the corrected model later in the post.
model
{
for(i in 1 : N) {
# deterministic assignment: no stochastic link between R and a1..a7
R [i] <- a1 * U [i] + a2 * B [i] + a3 * pow(D [i] , a4) + a5 * pow( S [i] , a6) + a7
}
# priors:
a1 ~ dnorm(0.0,1.0E-4)
a2 ~ dnorm(0.0,1.0E-4)
a3 ~ dnorm(0.0,1.0E-4);
a4 ~ dnorm(0.0,1.0E-4)
a5 ~ dnorm(0.0,1.0E-4)
a6 ~ dnorm(0.0,1.0E-4)
a7 ~ dnorm(0.0,1.0E-4)
}
list(N=120, R = c( 2.19, 2.12, 1.88, 2.81, 2.2, 2.37, 2.34, 2.9, 3.04, 3.07, 3.04, 2.95, 2.66, 2.3, 2.17, 2.87, 2.48, 2.02, 1.87, 2, 2.45, 2.18, 2.17, 2.09, 2.1, 2.05, 2.6, 2.46, 2.2, 2.14, 2.78, 2.03, 2.05, 2.47, 2.05, 2.47, 1.83, 2.08, 1.87, 2.1, 2.31, 2.43, 1.79, 2.17, 2.17, 1.65, 1.75, 1.87, 1.78, 1.78, 2.39, 1.28, 1.85, 2.2, 2.43, 2.13, 1.71, 2.04, 2.39, 2.14, 1.88, 1.88, 1.27, 2.14, 1.77, 2.14, 1.6, 2.05, 2.44, 2.42, 1.67, 2.12, 2.17, 2.28, 2.12, 2.47, 2, 1.5, 2.47, 2.35, 2.07, 2.28, 2.16, 2.61, 1.91, 2.26, 1.85, 1.58, 1.51, 2.36, 1.82, 1.5, 2.05, 1.84, 1.38, 1.96, 1.85, 1.95, 1.6, 1.88, 1.88, 2.11, 1.64, 1.51, 2.17, 2.16, 1.98, 2.09, 1.77, 2, 1.84, 2.15, 1.7, 1.46, 2.23, 1.87, 1.82, 2.64, 1.94, 1.97
),
U = c( 199.7, 199.3, 199, 189.7, 189, 168.3, 174.1, 177.9, 180.7, 184.1, 192.4, 191, 191, 194.5, 195.2, 188.3, 188.4, 193.1, 182.8, 182.4, 182.4, 192.7, 192.6, 164.1, 160.7, 160.7, 144.8, 140, 137.2, 136.6, 135.2, 136.2, 137, 137.4, 137, 137.4, 139.3, 139.4, 141, 140.7, 139.7, 138.9, 136.5, 134.2, 131.4, 130, 128.6, 130, 131, 131, 129, 130, 130.1, 130.2, 138.6, 138.8, 138.8, 138.7, 138.6, 137.4, 137.2, 136.2, 135.2, 134.8, 134.1, 133.3, 133.4, 134.1, 134.3, 134.3, 134.3, 133.9, 133.8, 143.4, 145.4, 146.2, 145.9, 137.5, 140.7, 145.5, 158.6, 159, 159.3, 159.3, 160.3, 159.6, 153.8, 154.5, 157.9, 160.3, 163.2, 164.3, 166.4, 173.1, 173.1, 174.1, 174.5, 175.2, 176, 176.8, 176.8, 177, 175.9, 175.2, 173.1, 156.9, 150.7, 147.6, 147, 145.6, 144.8, 140.7, 125.5, 123.1, 122.3, 121.7, 121.7, 121.7, 120.7, 119.7
),
B= c( 55, 55, 55, 56, 56, 58, 58, 58, 57, 57, 54, 54, 53, 52, 52, 46, 45, 43, 39, 39, 39, 42, 42, 46, 46, 46, 42, 43, 42, 41, 41, 40, 39, 39, 39, 39, 38, 38, 38, 38, 37, 37, 35, 34, 34, 34, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38, 37, 36, 36, 36, 35, 36, 35, 33, 33, 32, 32, 32, 30, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 29, 29, 29, 29, 30, 30, 29, 29
),
D = c( 0.8, 1.6, 2, 0.2, 2, 1.6, 2, 0.4, 0.2, 0.4, 0.4, 0.4, 0.1, 0.4, 0.4, 0.2, 0.2, 0.8, 0.8, 0.8, 0.4, 0.4, 0.2, 0.8, 1.6, 1.6, 0.2, 0.1, 1.6, 0.8, 0.4, 0.4, 1.6, 0.8, 1.6, 0.8, 2, 2, 1.6, 1.6, 1.6, 0.8, 1.6, 2, 2, 1.6, 1.6, 1.6, 0.4, 0.4, 0.8, 0.8, 0.8, 0.8, 0.4, 0.8, 0.4, 0.8, 0.2, 1.6, 0.8, 0.4, 1.6, 0.4, 1.6, 0.8, 1.6, 0.8, 0.4, 0.4, 1.6, 0.4, 0.4, 1.6, 1.6, 0.4, 0.8, 1.6, 0.1, 0.8, 0.1, 0.2, 0.8, 0.8, 0.8, 0.4, 0.8, 2, 1.6, 0.1, 0.8, 0.4, 0.4, 0.8, 0.8, 0.8, 0.1, 0.4, 2, 1.6, 1.6, 0.4, 2, 2, 1.6, 0.8, 1.6, 2, 2, 1.6, 2, 1.6, 2, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.2
),
S = c( 25, 21, 20, 42, 40, 41, 35, 61, 55, 49, 32, 34, 23, 33, 30, 29, 24, 41, 20, 66, 55, 55, 59, 19, 66, 31, 67, 46, 70, 62, 29, 32, 21, 46, 21, 46, 41, 40, 10, 28, 36, 47, 14, 41, 44, 15, 10, 15, 2, 11, 39, 2, 26, 32, 58, 55, 8, 17, 50, 69, 67, 45, 7, 62, 76, 47, 18, 19, 30, 40, 4, 81, 60, 33, 71, 32, 61, 10, 15, 52, 36, 74, 35, 78, 26, 20, 52, 52, 10, 70, 21, 4, 74, 33, 11, 37, 4, 56, 71, 88, 77, 89, 66, 79, 63, 51, 81, 44, 86, 72, 70, 68, 70, 10, 57, 36, 56, 80, 68, 60
))
list(a7 = 2, a1= 0, a2 = 0, a3 = 0,
a4 = 0, a5= 0, a6 = 0)
a1, a2, a3, a4, a5, a6, a7
This looks like you're doing regression where you'd do:
Where is your function of a1, a2, etc. However, your model implies:
The big difference is that there is no error or uncertainty in your model. You can change that by doing:
# Corrected model: the regression mean mu[i] is deterministic, and the observed
# R[i] gets a normal likelihood around it with precision prec (= 1/variance).
model{
for(i in 1:N) {
mu[i] <- a1 * U [i] + a2 * B [i] + a3 * pow(D [i] , a4) + a5 * pow( S [i] , a6) + a7
# likelihood: observations vary around the regression mean
R [i] ~ dnorm(mu[i], prec)
}
# priors:
prec ~ dgamma(.001, .001)
a1 ~ dnorm(0.0,1.0E-4)
a2 ~ dnorm(0.0,1.0E-4)
a3 ~ dnorm(0.0,1.0E-4)
a4 ~ dnorm(0.0,1.0E-4)
a5 ~ dnorm(0.0,1.0E-4)
a6 ~ dnorm(0.0,1.0E-4)
a7 ~ dnorm(0.0,1.0E-4)
}
Note that this also introduces a new parameter, the standard deviation of the distribution, modelled here as precision.