awk distance between records - awk

Hey I'm trying to find the distance between records in a text file. I'm trying to do it using awk.
An example input is:
1 2 1 4 yes
2 3 2 2 no
1 1 1 5 yes
4 2 4 0 no
5 1 0 1 no
I want to find the distance between each of the numerical values. I'm doing this by subtracting the values and then squaring the answer. I have tried the following code below but all the distances are simply 0. Any help would be appreciated.
# Cache every field of every record, then in END accumulate the pairwise
# squared Euclidean distance over the first (fieldnum-1) numeric columns.
BEGIN {recs = 0; fieldnum = 5;}
{
recs++;                                   # recs mirrors NR
for(i=1;i<=NF;i++) {data[recs,i] = $i;}   # data[record,field] = value
}
END {
for(r=1;r<=recs;r++) {
# f stops at fieldnum-1 so the trailing yes/no column is excluded
for(f=1;f<fieldnum;f++) {
##find distances
for(t=1;t<=recs;t++) {
# add (x_rf - x_tf)^2 into the r,t cell
distance[r,t]+=((data[r,f] - data[t,f])*(data[r,f] - data[t,f]));
}
}
}
for(r=1;r<=recs;r++) {
# NOTE(review): "t<recs" never prints the distance to the final record;
# t<=recs would print the full matrix row
for(t=1;t<recs;t++) {
##print distances
printf("distance between %d and %d is %d \n",r,t,distance[r,t]);
}
}
}

No idea what you mean conceptually by the "distance between each of the numerical values" so I can't help you with your algorithm but let's clean up the code to see what that looks like:
$ cat tst.awk
# Cache all fields of every record; pairwise squared distances are
# computed and printed in END.
{
for(i=1;i<=NF;i++) {
data[NR,i] = $i
}
}
END {
for(r=1;r<=NR;r++) {
# NOTE(review): in END, NF still holds the field count of the LAST
# record read (5 here), so f runs 1..4 and the yes/no column is skipped
for(f=1;f<NF;f++) {
##find distances
for(t=1;t<=NR;t++) {
delta = data[r,f] - data[t,f]
distance[r,t]+=(delta * delta)
}
}
}
for(r=1;r<=NR;r++) {
# NOTE(review): "t<NR" omits the distance to the final record, which is
# why the sample output below stops at t=4 for every r
for(t=1;t<NR;t++) {
##print distances
printf "distance between %d and %d is %d\n",r,t,distance[r,t]
}
}
}
$
$ awk -f tst.awk file
distance between 1 and 1 is 0
distance between 1 and 2 is 7
distance between 1 and 3 is 2
distance between 1 and 4 is 34
distance between 2 and 1 is 7
distance between 2 and 2 is 0
distance between 2 and 3 is 15
distance between 2 and 4 is 13
distance between 3 and 1 is 2
distance between 3 and 2 is 15
distance between 3 and 3 is 0
distance between 3 and 4 is 44
distance between 4 and 1 is 34
distance between 4 and 2 is 13
distance between 4 and 3 is 44
distance between 4 and 4 is 0
distance between 5 and 1 is 27
distance between 5 and 2 is 18
distance between 5 and 3 is 33
distance between 5 and 4 is 19
Seems to produce some non-zero output....

Related

dynamic programming, tsp problem, 9 cities out of 15

I think I have some kind of TSP problem. I have matrix of distances between 15 cities:
A B C D E F G H I J K L M N O
A 0 3 8 7 8 9 4 4 2 9 5 5 7 9 9
B 9 0 6 3 8 9 3 9 5 3 3 4 8 6 8
C 1 7 0 8 3 5 4 3 1 1 7 8 2 4 3
D 1 9 7 0 4 3 5 6 8 4 3 4 2 8 9
E 5 8 3 5 0 9 7 4 9 4 5 7 4 6 2
F 5 7 9 6 2 0 3 5 3 6 6 7 4 9 2
G 3 2 8 1 1 8 0 3 4 5 2 4 7 2 6
H 1 4 7 5 5 3 8 0 1 1 7 6 5 8 1
I 5 5 6 5 5 6 6 4 0 2 1 3 4 9 5
J 4 5 4 1 3 9 2 7 9 0 6 8 1 9 9
K 3 4 6 5 9 4 9 5 2 5 0 5 1 4 2
L 8 9 5 2 6 2 9 9 4 5 5 0 3 1 5
M 5 9 7 1 5 5 5 4 6 2 1 6 0 9 2
N 9 5 7 5 7 8 6 5 2 7 1 2 9 0 1
O 7 6 9 6 9 8 4 5 6 2 9 7 7 7 0
Distance from A to B is not the same as distance from B to A.
letter in row means city from
letter in column means city to
Example:
distance from A to F is 9
distance from F to A is 5
I have to start and end in city A. I have to travel to 9 different cities, and I can't visit the same city twice. The travelled distance should be minimised. I am familiar with the TSP algorithm, but I am not certain how to apply it to only 9 of the 15 cities. It should be possible to solve this by using the TSP algorithm only once. Thanks for the help.
Eventually i figured it out:
// Dynamic Programming based Java program to find shortest path with
// exactly k edges
import java.util.*;
import java.lang.*;
import java.io.*;
// Computes the weight of the cheapest walk from vertex u to vertex v using
// exactly k edges over a 15x15 asymmetric distance matrix; the global table
// S records which vertex was chosen at each edge count, for path tracking.
class knapsack{
// Define number of vertices in the graph and infinite value
static final int V = 15;
static final int INF = Integer.MAX_VALUE;
static int numberofedges=10;
// S[i][j][e]: vertex recorded for the e-edge path from i to j (-1 = unset)
static int[][][] S=new int[15][15][15];
// A Dynamic programming based function to find the shortest path
// from u to v with exactly k edges.
// NOTE(review): "<bolean>" declares an unused generic type parameter
// literally named "bolean"; it compiles but does nothing — safe to remove.
static <bolean> int shortestPath(int graph[][], int u, int v, int k)
{ for(int y=0;y<15;y++){
for(int x=0;x<15;x++){
for(int z=0;z<9;z++){
S[x][y][z]=-1;
}}}
// Table to be filled up using DP. The value sp[i][j][e] will
// store weight of the shortest path from i to j with exactly
// k edges
int sp[][][] = new int[V][V][k+1];
// NOTE(review): Arrays.toString on a 3-D array prints object references,
// not contents; Arrays.deepToString(S) would show the actual data.
System.out.println(Arrays.toString(S));
// Loop for number of edges from 0 to k
for (int e = 0; e <= k; e++)
{
for (int i = 0; i < V; i++) // for source
{
for (int j = 0; j < V; j++) // for destination
{
sp[i][j][e] = INF;
// NOTE(review): this scan appears intended to skip cells where j was
// already recorded earlier on the path (no city visited twice) —
// verify, since S holds a single vertex per (i,j,e), not a full path.
boolean gofind=true;
for (int x = 1; x <= e; x++) {
if (S[i][j][x] == j) {
System.out.println(S[i][j][x]);
System.out.println("TU");
gofind = false;
break;
}
}
if(gofind){
// initialize value
// from base cases
if (e == 0 && i == j) {
S[i][j][0]=0;
sp[i][j][e] = 0;
}
if (e == 1 && graph[i][j] != INF) {
S[i][j][e]=j;
sp[i][j][e] = graph[i][j];
}
// go to adjacent only when number of edges is
// more than 1
if (e > 1)
{int help=225;   // 225 > V acts as "no improving intermediate found yet"
for (int a = 0; a < V; a++)
{
// There should be an edge from i to a and
// a should not be same as either i or j
if (graph[i][a] != INF && i != a &&j!= a && sp[a][j][e-1] != INF)
{if(sp[i][j][e]>graph[i][a] + sp[a][j][e-1]){
help=a;
}
sp[i][j][e] = Math.min(sp[i][j][e],graph[i][a] + sp[a][j][e-1]);
if(help>16)
S[i][j][e]=j;
else
S[i][j][e]=a;
}}
}
}}
}
}
return sp[u][v][k];
}
// Reads the 15x15 distance matrix from the data file (header line skipped),
// then prints the weight of the cheapest closed walk of numberofedges edges
// from vertex 0 (city A) back to itself.
public static void main (String[] args)
{
try {
Scanner sc = null;
sc = new Scanner(new BufferedReader(new FileReader("src/ADS2021_cvicenie5data.txt")));
/* Let us create the graph shown in above diagram*/
int[][] graph = new int[15][15];
sc.nextLine();
while(sc.hasNextLine()) {
for (int i=0; i<graph.length; i++) {
String[] line = sc.nextLine().trim().split(" ");
for (int j=0; j<line.length; j++) {
graph[i][j] = Integer.parseInt(line[j]);
}
}
}
System.out.println(Arrays.deepToString(graph));
System.out.println("Weight of the shortest path is "+ shortestPath(graph, 0, 0, numberofedges));
System.out.println(Arrays.toString(S));
}
catch (
FileNotFoundException e) {
e.printStackTrace();
}
}
}

Using awk to count number of row group

I have a data set: (file.txt)
X Y
1 a
2 b
3 c
10 d
11 e
12 f
15 g
20 h
25 i
30 j
35 k
40 l
41 m
42 n
43 o
46 p
I want to add two columns which are Up10 and Down10,
Up10: From (X) to (X-10) count of row.
Down10 : From (X) to (X+10)
count of row
For example:
X Y Up10 Down10
35 k 3 5
For Up10: 35-10=25, so the rows with X=35, X=30, X=25 count — Total = 3 rows
For Down10: 35+10=45, so the rows with X=35, X=40, X=41, X=42, X=43 count — Total = 5 rows
Desired Output:
X Y Up10 Down10
1 a 1 5
2 b 2 5
3 c 3 4
10 d 4 5
11 e 5 4
12 f 5 3
15 g 4 3
20 h 5 3
25 i 3 3
30 j 3 3
35 k 3 5
40 l 3 5
41 m 3 4
42 n 4 3
43 o 5 2
46 p 5 1
This is the Pierre François' solution: Thanks again #Pierre François
awk '
# Pass 1 (NR==FNR): collect every X value into a[]. Pass 2: for each row,
# scan the WHOLE array to count X values within 10 below (Up10) and within
# 10 above (Down10).
# NOTE(review): the "for (i in a)" full scan per row makes this O(rows^2)
# overall, which is why it crawls on a 13GB input.
BEGIN{OFS="\t"; print "X\tY\tUp10\tDown10"}
(NR == FNR) && (FNR > 1){a[$1] = $1 + 0}
(NR > FNR) && (FNR > 1){
up = 0; upl = $1 - 10
down = 0; downl = $1 + 10
for (i in a) { i += 0 # tricky: convert i to integer
if ((i >= upl) && (i <= $1)) {up++}
if ((i >= $1) && (i <= downl)) {down++}
}
print $1, $2, up, down;
}
' file.txt file.txt > file-2.txt
But when i use this command for 13GB data, it takes too long.
I have used this way for 13GB data again:
awk 'BEGIN{ FS=OFS="\t" }
# Pass 1: a[NR] = X value of every row (the whole first file is kept in
# memory). Pass 2: from the current row walk backwards while X stays within
# 10 below, forwards while within 10 above, then print the two window sizes.
NR==FNR{a[NR]=$1;next} {x=y=FNR;while(--x in a&&$1-10<a[x]){} while(++y in a&&$1+10>a[y]){} print $0,FNR-x,y-FNR}
' file.txt file.txt > file-2.txt
When file-2.txt reaches 1.1GB the command freezes. I have waited several hours, but the command never finishes and no final output file is produced.
Note: I am working on Gogole cloud. Machine type
e2-highmem-8 (8 vCPUs, 64 GB memory)
A single pass awk that keeps a sliding window of the 10 last records and uses that to count the ups and downs. For symmetry's sake there should be deletes in the END block, but I guess a few extra array elements in memory aren't going to make a difference:
$ awk '
# Single pass: keep a sliding window of the 10 most recent records in a[]
# (X values) and b[] (Y values); up[] is completed as soon as a row is read,
# down[] is completed once 9 further rows have been seen.
BEGIN {
FS=OFS="\t"
}
NR==1 {
print $1,$2,"Up10","Down10"
}
NR>1 {
a[NR]=$1
b[NR]=$2
for(i=NR-9;i<=NR;i++) {
if(a[i]>=a[NR]-10&&i>=2)
up[NR]++
if(a[i]<=a[NR-9]+10&&i>=2)
down[NR-9]++
}
}
NR>10 {
# row NR-9 has now seen every row that can affect it: print and free it
print a[NR-9],b[NR-9],up[NR-9],down[NR-9]
delete a[NR-9]
delete b[NR-9]
delete up[NR-9]
delete down[NR-9]
}
END {
# flush the 9 still-buffered rows, completing their Down10 counts first
for(nr=NR+1;nr<=NR+9;nr++) {
for(i=nr-9;i<=nr;i++)
if(a[i]<=a[nr-9]+10&&i>=2&&i<=NR)
down[nr-9]++
print a[nr-9],b[nr-9],up[nr-9],down[nr-9]
}
}' file
Output:
X Y Up10 Down10
1 a 1 5
2 b 2 5
...
35 k 3 5
...
43 o 5 2
46 p 5 1
Another single pass approach with a sliding window
awk '
# Single pass with three cursors into a sliding window: cur is the next row
# to print, min the oldest row still within 10 below X[cur], max the newest
# row read. Rows are printed once their full +/-10 neighbourhood is known.
NR == 1 { next } # skip the header
NR == 2 { min = max = cur = 1; X[cur] = $1; Y[cur] = $2; next }
{ X[++max] = $1; Y[max] = $2
if (X[cur] >= $1 - 10) next
# X[max] is now more than 10 above X[cur]: rows up to cur are complete
for (; X[cur] + 10 < X[max]; ++cur) {
for (; X[min] < X[cur] - 10; ++min) {
delete X[min]
delete Y[min]
}
print X[cur], Y[cur], cur - min + 1, max - cur
}
}
END {
# flush the tail rows whose +10 window ran past end-of-file
for (; cur <= max; ++cur) {
for (; X[min] < X[cur] - 10; ++min);
for (i = max; i > cur && X[cur] + 10 < X[i]; --i);
print X[cur], Y[cur], cur - min + 1, i - cur + 1
}
}
' file
The script assumes the X column is ordered numerically.

Create bins with totals and percentage

I would like to create bins to get histogram with totals and percentage, e.g. starting from 0.
If possible to set the minimum and maximum value in the bins ( in my case value min=0 and max=20 )
Input file
8 5
10 1
11 4
12 4
12 4
13 5
16 7
18 9
16 9
17 7
18 5
19 5
20 1
21 7
output desired
0 0 0.0%
0 - 2 0 0.0%
2 - 4 0 0.0%
4 - 6 0 0.0%
6 - 8 0 0.0%
8 - 10 5 6.8%
10 - 12 5 6.8%
12 - 14 13 17.8%
14 - 16 0 0.0%
16 - 18 23 31.5%
18 - 20 19 26.0%
> 20 8 11.0%
---------------------
Total: 73
I use this code from Mr Ed Morton, it works perfectly but the percentage is missed.
awk 'BEGIN { delta = (delta == "" ? 2 : delta) }
# Bucket each input value into fixed-width bins of size delta (overridable
# with -v delta=N) and count how many values land in each bin.
{
bucketNr = int(($0+delta) / delta)
cnt[bucketNr]++
numBuckets = (numBuckets > bucketNr ? numBuckets : bucketNr)
}
END {
# print every bin from 0 up to the highest one seen, empty bins included
for (bucketNr=1; bucketNr<=numBuckets; bucketNr++) {
end = beg + delta
printf "%0.1f %0.1f %d\n", beg, end, cnt[bucketNr]
beg = end
}
}' file
Thanks in advance
Your expected output doesn't seem to correspond to your sample input data, but try this variation of that awk code in your question (intended to be put in an executable file and run as a script, not as a one-liner, due to its size):
#!/usr/bin/awk -f
# Histogram with percentages: bin column 1 into delta-wide buckets, track
# the largest column-2 value per bucket, and report each bucket as
# "beg-end max percent-of-rows", followed by the column-2 grand total.
BEGIN { delta = (delta == "" ? 2 : delta) }
{
bucketNr = int(($0+delta) / delta)
cnt[bucketNr]++
# keep the maximum 2nd-column value seen in this bucket
max[bucketNr] = max[bucketNr] < $2 ? $2 : max[bucketNr]
sum += $2
numBuckets = (numBuckets > bucketNr ? numBuckets : bucketNr)
}
END {
for (bucketNr=1; bucketNr<=numBuckets; bucketNr++) {
end = beg + delta
printf "%d-%d %d %.1f\n", beg, end, max[bucketNr],
(cnt[bucketNr] / NR) * 100
beg = end
}
print "-------------"
print "Total " sum
}
It adds tracking the maximum of the second column for each bin the first column falls in, and prints out a percentage instead of a count of how many rows were in each bin. Plus some tweaks to the output format to better match your desired output.

Awk code with associative arrays -- array doesn't seem populated, but no error

Question: Why does it seem that date_list[d] and isin_list[i] are not getting populated, in the code segment below?
AWK Code (on GNU-AWK on a Win-7 machine)
BEGIN { FS = "," } # This SEBI data set has comma-separated fields (NSE snapshots are pipe-separated)
# UPDATE the lists for DATE ($10), firm_ISIN ($9), EXCHANGE ($12), and FII_ID ($5).
( $17~/_EQ\>/ ) {
if (date[$10]++ == 0) date_list[d++] = $10; # Dates appear in order in raw data
if (isin[$9]++ == 0) isin_list[i++] = $9; # ISINs appear out of order in raw data
# NOTE(review): d and i were post-incremented above, so date_list[d] and
# isin_list[i] here address the NEXT (still empty) slots and print blank;
# the entries just stored are date_list[d-1] and isin_list[i-1].
print $10, date[$10], $9, isin[$9], date_list[d], d, isin_list[i], i
}
input data
49290,C198962542782200306,6/30/2003,433581,F5811773991200306,S5405611832200306,B5086397478200306,NESTLE INDIA LTD.,INE239A01016,6/27/2003,1,E9035083824200306,REG_DL_STLD_02,591.13,5655,3342840.15,REG_DL_INSTR_EQ,REG_DL_DLAY_P,DL_RPT_TYPE_N,DL_AMDMNT_DEL_00
49291,C198962542782200306,6/30/2003,433563,F6292896459200306,S6344227311200306,B6110521493200306,GRASIM INDUSTRIES LTD.,INE047A01013,6/27/2003,1,E9035083824200306,REG_DL_STLD_02,495.33,3700,1832721,REG_DL_INSTR_EQ,REG_DL_DLAY_P,DL_RPT_TYPE_N,DL_AMDMNT_DEL_00
49292,C198962542782200306,6/30/2003,433681,F6513202607200306,S1724027402200306,B6372023178200306,HDFC BANK LTD,INE040A01018,6/26/2003,1,E745964372424200306,REG_DL_STLD_02,242,2600,629200,REG_DL_INSTR_EQ,REG_DL_DLAY_D,DL_RPT_TYPE_N,DL_AMDMNT_DEL_00
49293,C7885768925200306,6/30/2003,48128,F4406661052200306,S7376401565200306,B4576522576200306,Maruti Udyog Limited,INE585B01010,6/28/2003,3,E912851176274200306,REG_DL_STLD_04,125,44600,5575000,REG_DL_INSTR_EQ,REG_DL_DLAY_P,DL_RPT_TYPE_N,DL_AMDMNT_DEL_00
49294,C7885768925200306,6/30/2003,48129,F4500260787200306,S1312094035200306,B4576522576200306,Maruti Udyog Limited,INE585B01010,6/28/2003,4,E912851176274200306,REG_DL_STLD_04,125,445600,55700000,REG_DL_INSTR_EQ,REG_DL_DLAY_P,DL_RPT_TYPE_N,DL_AMDMNT_DEL_00
49295,C7885768925200306,6/30/2003,48130,F6425024637200306,S2872499118200306,B4576522576200306,Maruti Udyog Limited,INE585B01010,6/28/2003,3,E912851176274200306,REG_DL_STLD_04,125,48000,6000000,REG_DL_INSTR_EU,REG_DL_DLAY_P,DL_RPT_TYPE_N,DL_AMDMNT_DEL_00
output that I am getting
6/27/2003 1 INE239A01016 1 1 1
6/27/2003 2 INE047A01013 1 1 2
6/26/2003 1 INE040A01018 1 2 3
6/28/2003 1 INE585B01010 1 3 4
6/28/2003 2 INE585B01010 2 3 4
Expected output
As far as I can tell, the print is printing out correctly (i) $10 (the date) (ii) date[$10), the count for each date (iii) $9 (firm-ID called ISIN) (iv) isin[$9], the count for each ISIN (v) d (index of date_list, the number of unique dates) and (vi) i (index of isin_list, the number of unique ISINs). I should also get two more columns -- columns 5 and 7 below -- for date_list[d] and isin_list[i], which will have values that look like $10 and $9.
6/27/2003 1 INE239A01016 1 6/27/2003 1 INE239A01016 1
6/27/2003 2 INE047A01013 1 6/27/2003 1 INE047A01013 2
6/26/2003 1 INE040A01018 1 6/26/2003 2 INE040A01018 3
6/28/2003 1 INE585B01010 1 6/28/2003 3 INE585B01010 4
6/28/2003 2 INE585B01010 2 6/28/2003 3 INE585B01010 4
actual code I now use is
# NOTE(review): this version does not run as posted; see the inline notes.
{ if (date[$10]++ == 0) date_list[d++] = $10;
if (isin[$9]++ == 0) isin_list[i++] = $9;}
# NOTE(review): stray extra ")" after the pattern below — syntax error.
( $11~/1|2|3|5|9|1[24]/ )) { ++BNR[$10,$9,$12,$5]}
END { { for (u = 0; u < d; u++)
{for (v = 0; v < i; v++)
# NOTE(review): BNR was built with a 4-part key ($10,$9,$12,$5) but is
# read here with a 2-part key, so the lookup can never match; testing
# BNR[...]>0 also silently creates empty elements in the array.
{ if (BNR[date_list[u],isin_list[v]]>0)
BR=BNR[date_list[u],isin_list[v]]
# NOTE(review): "print(...}" is unbalanced — missing ")" — syntax error.
{ print(date_list[u], isin_list[v], BR}}}}}
Thanks a lot to everyone.

Sub-group finding

I have this kind of records (rows):
0 1 4 8 2 3 7 9 3 4 8 9 4 7 9 1 0 0 2 5 8 2 4 5 6 1 0 2 4 8 9 0
Definitions:
group: collection of numbers which are separated by 0-s (zeros)
sub-group: collection of numbers which are separated by local minima in the groups
local minimum: the numbers before and after it are greater
In the above example there are 3 groups and 7 sub-groups, i.e.
groups: 1 4 8 2 3 7 9 3 4 8 9 4 7 9 1 , 2 5 8 2 4 5 6 1 , 2 4 8 9
sub-groups: 1 4 8 , 3 7 9 , 4 8 9 , 7 9 1 , 2 5 8 , 4 5 6 1 , 2 4 8 9 (this last is identical to the group itself)
So, in these kind of records I have to
find the minima (print out: 2, 3, 4, 2)
the size (number of characters) of these sub-groups
positions of numbers of the sub-groups in the groups
I have already started to write something, but I am stuck here...
Can anyone help me to solve this?
Here is the code so far:
#!/usr/bin/awk -f
# Split each record into groups at runs of zeros, then split every group
# into sub-groups and print each sub-group's members one per line,
# separated by blank lines.
{
db = split($0,a,/( 0)+ */)
for (i=1; i<=db; i++) {
split_at_max(a[i])
for (j=1; j<=ret_count; j++) {
print ""
for (k=1; k<=maximums[j]; k++) {
print ret[j,k]
}
}
}
}
# Splits group x (a space-separated string) into sub-groups, filling the
# globals ret[subgroup,pos], maximums[subgroup] (= member count of that
# sub-group) and ret_count (= number of sub-groups found).
# NOTE(review): the split condition tests a rise/peak/fall shape around
# values[mi] plus a following rise — verify this matches the question's
# local-MINIMUM definition; the author reports being stuck here.
function split_at_max(x) {
m_db = split(x,values," ")
for (mx in ret) {
delete ret[mx]
}
ret_count = 1
ret_curr_db = 0
for (mi=2; mi<m_db; mi++) {
ret_curr_db++
ret[ret_count,ret_curr_db] = values[mi-1]
if ( (values[mi-1] <= values[mi]) &&
(values[mi] >= values[mi+1]) &&
(values[mi+1] <= values[mi+2]) ) {
maximums[ret_count] = ret_curr_db
ret_count++
ret_curr_db = 0
}
}
# append the last two pending values to the final sub-group
ret_curr_db++
ret[ret_count,ret_curr_db] = values[mi-1]
ret_curr_db++
ret[ret_count,ret_curr_db] = values[mi]
maximums[ret_count] = ret_curr_db
}
Interesting assignment.
I wrote a quick and dirty awk script; there should be a lot of room to optimize. I don't know what kind of output you are expecting...
# RS="0" makes every "0" character a record separator, so each record is
# one group; NF>1 skips the empty fragments between consecutive zeros.
# For each group: print it, locate its local minima, then print the
# sub-groups delimited by those minima (minima themselves excluded).
awk -v RS="0" 'NF>1{
delete g;
print "group:";
for(i=1;i<=NF;i++){
printf $i" ";
g[i]=$i
}
print "";
t=1;
delete m;
# m[1..t-1] = positions of the local minima inside g
for(i=2;i<length(g);i++){
if(g[i-1]>g[i] && g[i]<g[i+1]) {
print "found minima:"g[i]
m[t]=i;
t++;
}
}
if(length(m)>0){
s=0;
# print the slice between consecutive minima
for(x=1;x<=length(m);x++){
printf "sub-group: "
for(i=s+1;i<m[x];i++){
printf g[i]" "
s=m[x];
}
print "";
if(x+1>length(m)){
printf "sub-group: ";
for(i=s+1;i<=length(g);i++)
printf g[i]" "
print "";
}
}
}else{
print "no minima found. sub-group is the same as group:"
printf "sub-group: "
for(i=1;i<=NF;i++){
printf $i" ";
g[i]=$i
}
}
print "\n-----------------------------"
# NOTE(review): the closing single quote of the awk program appears to
# have been lost in the paste; the final line should presumably end with
# the program-closing quote before the filename.
} yourFile
the output on your example input:
group:
1 4 8 2 3 7 9 3 4 8 9 4 7 9 1
found minima:2
found minima:3
found minima:4
sub-group: 1 4 8
sub-group: 3 7 9
sub-group: 4 8 9
sub-group: 7 9 1
-----------------------------
group:
2 5 8 2 4 5 6 1
found minima:2
sub-group: 2 5 8
sub-group: 4 5 6 1
-----------------------------
group:
2 4 8 9
no minima found. sub-group is the same as group:
sub-group: 2 4 8 9
-----------------------------
update
fixing for those "special" elements like 20,30,40...
still quick and dirty:
change my awk script above to
sed 's/^0$//g' yourFile | awk -v RS="" [following codes are the same as above]......
then the output is:
group:
6 63 81 31 37 44 20
found minima:31
sub-group: 6 63 81
sub-group: 37 44 20
-----------------------------