Big numbers in lambdify - numpy

I have (automatically generated by SymPy) expression, including sqrt function with big numbers under sqrt and small multiplier in front of it. So overall result must be within float range, but value under sqrt - is not. I coonvert this expression to Python function using lambdify command. Call of this function gets exception:
from sympy import *
t = symbols("t")
k = 10
f = 2 * 10 ** (- k) * sqrt(10 ** (2 * k) * t ** 2 + 1)
print(f)
F = lambdify(t, f)
t0 = 10 ** 10
T = np.arange(t0, 2 * t0, t0 / 4)
print(T)
F(T)
Output:
2.0e-10*sqrt(100000000000000000000*t**2 + 1)
[1.00e+10 1.25e+10 1.50e+10 1.75e+10]
AttributeError: 'float' object has no attribute 'sqrt'
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_15740/1035914544.py in <module>
8 T = np.arange(t0, 2 * t0, t0 / 4)
9 print(T)
---> 10 F(T)
<lambdifygenerated-11> in _lambdifygenerated(t)
1 def _lambdifygenerated(t):
----> 2 return 2.0e-10*sqrt(100000000000000000000*t**2 + 1)
TypeError: loop of ufunc does not support argument 0 of type float which has no callable sqrt method
For k = 2 code works properly:
0.02*sqrt(10000*t**2 + 1)
[1.00e+10 1.25e+10 1.50e+10 1.75e+10]
array([2.0e+10, 2.5e+10, 3.0e+10, 3.5e+10])
Is there any way to fix this problem without manually rewriting expression?
UPD: Looks like it is a problem of NumPy:
import numpy as np
k = 10
def F1(t):
return np.sqrt( (10 ** (- k)) ** 2 * 10 ** (2 * k) * t ** 2 + 1)
def F2(t):
return 10 ** (- k) * np.sqrt(10 ** (2 * k) * t ** 2 + 1)
print(F1(10 ** 5))
print(F2(10 ** 5))
First call works, second call - not!

Converting argument of np.sqrt to numpy.double solves problem:
def Sqrt(x):
return np.sqrt(np.double(x))
F = lambdify(t, f, {'sqrt': Sqrt})

Because of the large multiplier, the np.sqrt argument is object dtype array:
In [3]: 100000000000000000000 * T**2
Out[3]: array([1e+40, 1.5625e+40, 2.25e+40, 3.0625e+40], dtype=object)
With object dtype arrays, numpy iterates (at list comprehension speed), applying a 'method' to each element. In effect
1e+40.sqrt() etc
Hence the no method error.
Your fix:
In [3]: np.double(100000000000000000000 * T**2)
Out[3]: array([1.0000e+40, 1.5625e+40, 2.2500e+40, 3.0625e+40])
In [4]: np.sqrt(_)
Out[4]: array([1.00e+20, 1.25e+20, 1.50e+20, 1.75e+20])
or
In [6]: np.sqrt((100000000000000000000 * T**2).astype(float))
Out[6]: array([1.00e+20, 1.25e+20, 1.50e+20, 1.75e+20])

Related

is there an R**2 values for finding in the linear regression analysis?

I'm trying to code a for linear regression analysis that prints TypeError: can't multiply sequence by non-int of type 'list',.
I tried to learn linear regression coefficient analysis
def corr_coef(x,y):
N = len(x)
num = (N * (x * y).sum()) - (x.sum() * y.sum())
den = np.sqrt((N * (x**2).sum() - x.sum()**2) * (N * (y**2).sum() - y.sum()**2))
R = num / den
return R
num = (N * (x * y).sum()) - (x.sum() * y.sum())
TypeError: can't multiply sequence by non-int of type 'list'

what is the difference between s[:] and s if s is a torch.Tensor [duplicate]

import numpy as np
import time
features, labels = d2l.get_data_ch7()
def init_adam_states():
v_w, v_b = torch.zeros((features.shape[1], 1),dtype=torch.float32), torch.zeros(1, dtype=torch.float32)
s_w, s_b = torch.zeros((features.shape[1], 1),dtype=torch.float32), torch.zeros(1, dtype=torch.float32)
return ((v_w, s_w), (v_b, s_b))
def adam(params, states, hyperparams):
beta1, beta2, eps = 0.9, 0.999, 1e-6
for p, (v, s) in zip(params, states):
v[:] = beta1 * v + (1 - beta1) * p.grad.data
s = beta2 * s + (1 - beta2) * p.grad.data**2
v_bias_corr = v / (1 - beta1 ** hyperparams['t'])
s_bias_corr = s / (1 - beta2 ** hyperparams['t'])
p.data -= hyperparams['lr'] * v_bias_corr / (torch.sqrt(s_bias_corr) + eps)
hyperparams['t'] += 1
def train_ch7(optimizer_fn, states, hyperparams, features, labels, batch_size=10, num_epochs=2):
# 初始化模型
net, loss = d2l.linreg, d2l.squared_loss
w = torch.nn.Parameter(torch.tensor(np.random.normal(0, 0.01, size=(features.shape[1], 1)), dtype=torch.float32),
requires_grad=True)
b = torch.nn.Parameter(torch.zeros(1, dtype=torch.float32), requires_grad=True)
def eval_loss():
return loss(net(features, w, b), labels).mean().item()
ls = [eval_loss()]
data_iter = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(features, labels), batch_size, shuffle=True)
for _ in range(num_epochs):
start = time.time()
print(w)
print(b)
for batch_i, (X, y) in enumerate(data_iter):
l = loss(net(X, w, b), y).mean() # 使⽤平均损失
# 梯度清零
if w.grad is not None:
w.grad.data.zero_()
b.grad.data.zero_()
l.backward()
optimizer_fn([w, b], states, hyperparams) # 迭代模型参数
if (batch_i + 1) * batch_size % 100 == 0:
ls.append(eval_loss()) # 每100个样本记录下当前训练误差
# 打印结果和作图
print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
d2l.set_figsize()
d2l.plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
d2l.plt.xlabel('epoch')
d2l.plt.ylabel('loss')
train_ch7(adam, init_adam_states(), {'lr': 0.01, 't': 1}, features, labels)
I want to implement the Adam algorithm in the follow code and I feel confused in the function named adam.
v = beta1 * v + (1 - beta1) * p.grad.data
s = beta2 * s + (1 - beta2) * p.grad.data**2
when I use the follow code, the loss function curve is figure 1.
figure 1
v[:] = beta1 * v + (1 - beta1) * p.grad.data
s = beta2 * s + (1 - beta2) * p.grad.data**2
or
v = beta1 * v + (1 - beta1) * p.grad.data
s[:] = beta2 * s + (1 - beta2) * p.grad.data**2
when I use the follow code, the loss function curve is figure 2.
figure 2
v[:] = beta1 * v + (1 - beta1) * p.grad.data
s[:] = beta2 * s + (1 - beta2) * p.grad.data**2
when I use the follow code, the loss function curve is figure 3.
figure 3
The loss function curve in case 3 has always been smoother than that in case 1.
The loss function curve in case 2 sometimes can't converge.
Why is different?
To answer the first question,
v = beta1 * v + (1 - beta1) * p.grad.data
is an out-of-place operation. Remember that python variables are references to objects. By assigning a new value to variable v, the underlying object which v referred to before this assignment will not be changed. Instead the expression beta1 * v + (1 - beta1) * p.grad.data results in a new tensor which is then referred to by v.
On the other hand
v[:] = beta1 * v + (1 - beta1) * p.grad.data
is an in-place operation. After this operation v still refers to the same underlying object, and the elements of that tensor are modified and replaced with the values of the new tensor beta1 * v + (1 - beta1) * p.grad.data.
Take a look at the following 3 lines to see why this matters
for p, (v, s) in zip(params, states):
v[:] = beta1 * v + (1 - beta1) * p.grad.data
s[:] = beta2 * s + (1 - beta2) * p.grad.data**2
v and s are actually referring to tensors which are stored in states. If we do in-place operations then the values in states are changed to reflect the value assigned to v[:] and s[:].
If out-of-place operations are used then the values in states remain unchanged.

Sample without replacement

How to sample without replacement in TensorFlow? Like numpy.random.choice(n, size=k, replace=False) for some very large integer n (e.g. 100k-100M), and smaller k (e.g. 100-10k).
Also, I want it to be efficient and on the GPU, so other solutions like this with tf.py_func are not really an option for me. Anything which would use tf.range(n) or so is also not an option because n could be very large.
This is one way:
n = ...
sample_size = ...
idx = tf.random_shuffle(tf.range(n))[:sample_size]
EDIT:
I had posted the answer below but then read the last line of your post. I don't think there is a good way to do it if you absolutely cannot produce a tensor with size O(n) (numpy.random.choice with replace=False is also implemented as a slice of a permutation). You could resort to a tf.while_loop until you have unique indices:
n = ...
sample_size = ...
idx = tf.zeros(sample_size, dtype=tf.int64)
idx = tf.while_loop(
lambda i: tf.size(idx) == tf.size(tf.unique(idx)),
lambda i: tf.random_uniform(sample_size, maxval=n, dtype=int64))
EDIT 2:
About the average number of iterations in the previous method. If we call n the number of possible values and k the length of the desired vector (with k ≤ n), the probability that an iteration is successful is:
p = product((n - (i - 1) / n) for i in 1 .. k)
Since each iteartion can be considered a Bernoulli trial, the average number of trials unitl first success is 1 / p (proof here). Here is a function that calculates the average numbre of trials in Python for some k and n values:
def avg_iter(k, n):
if k > n or n <= 0 or k < 0:
raise ValueError()
avg_it = 1.0
for p in (float(n) / (n - i) for i in range(k)):
avg_it *= p
return avg_it
And here are some results:
+-------+------+----------+
| n | k | Avg iter |
+-------+------+----------+
| 10 | 5 | 3.3 |
| 100 | 10 | 1.6 |
| 1000 | 10 | 1.1 |
| 1000 | 100 | 167.8 |
| 10000 | 10 | 1.0 |
| 10000 | 100 | 1.6 |
| 10000 | 1000 | 2.9e+22 |
+-------+------+----------+
You can see it varies wildy depending on the parameters.
It is possible, though, to construct a vector in a fixed number of steps, although the only algorithm I can think of is O(k2). In pure Python it goes like this:
import random
def sample_wo_replacement(n, k):
sample = [0] * k
for i in range(k):
sample[i] = random.randint(0, n - 1 - len(sample))
for i, v in reversed(list(enumerate(sample))):
for p in reversed(sample[:i]):
if v >= p:
v += 1
sample[i] = v
return sample
random.seed(100)
print(sample_wo_replacement(10, 5))
# [2, 8, 9, 7, 1]
print(sample_wo_replacement(10, 10))
# [6, 5, 8, 4, 0, 9, 1, 2, 7, 3]
This is a possible way to do it in TensorFlow (not sure if the best one):
import tensorflow as tf
def sample_wo_replacement_tf(n, k):
# First loop
sample = tf.constant([], dtype=tf.int64)
i = 0
sample, _ = tf.while_loop(
lambda sample, i: i < k,
# This is ugly but I did not want to define more functions
lambda sample, i: (tf.concat([sample,
tf.random_uniform([1], maxval=tf.cast(n - tf.shape(sample)[0], tf.int64), dtype=tf.int64)],
axis=0),
i + 1),
[sample, i], shape_invariants=[tf.TensorShape((None,)), tf.TensorShape(())])
# Second loop
def inner_loop(sample, i):
sample_size = tf.shape(sample)[0]
v = sample[i]
j = i - 1
v, _ = tf.while_loop(
lambda v, j: j >= 0,
lambda v, j: (tf.cond(v >= sample[j], lambda: v + 1, lambda: v), j - 1),
[v, j])
return (tf.where(tf.equal(tf.range(sample_size), i), tf.tile([v], (sample_size,)), sample), i - 1)
i = tf.shape(sample)[0] - 1
sample, _ = tf.while_loop(lambda sample, i: i >= 0, inner_loop, [sample, i])
return sample
And an example:
with tf.Graph().as_default(), tf.Session() as sess:
tf.set_random_seed(100)
sample = sample_wo_replacement_tf(10, 5)
for i in range(10):
print(sess.run(sample))
# [3 0 6 8 4]
# [5 4 8 9 3]
# [1 4 0 6 8]
# [8 9 5 6 7]
# [7 5 0 2 4]
# [8 4 5 3 7]
# [0 5 7 4 3]
# [2 0 3 8 6]
# [3 4 8 5 1]
# [5 7 0 2 9]
This is quite intesive on tf.while_loops, though, which are well-known not to be particularly fast in TensorFlow, so I wouldn't know how fast can you really get with this method without some kind of benchmarking.
EDIT 4:
One last possible method. You can divide the range of possible values (0 to n) in "chunks" of size c and pick a random amount of numbers from each chunk, then shuffle everything. The amount of memory that you use is limited by c, and you don't need nested loops. If n is divisible by c, then you should get about a perfect random distribution, otherwise values in the last "short" chunk would receive some extra probability (this may be negligible depending on the case). Here is a NumPy implementation. It is somewhat long to account for different corner cases and pitfalls, but if c ≥ k and n mod c = 0 several parts get simplified.
import numpy as np
def sample_chunked(n, k, chunk=None):
chunk = chunk or n
last_chunk = chunk
parts = n // chunk
# Distribute k among chunks
max_p = min(float(chunk) / k, 1.0)
max_p_last = max_p
if n % chunk != 0:
parts += 1
last_chunk = n % chunk
max_p_last = min(float(last_chunk) / k, 1.0)
p = np.full(parts, 2)
# Iterate until a valid distribution is found
while not np.isclose(np.sum(p), 1) or np.any(p > max_p) or p[-1] > max_p_last:
p = np.random.uniform(size=parts)
p /= np.sum(p)
dist = (k * p).astype(np.int64)
sample_size = np.sum(dist)
# Account for rounding errors
while sample_size < k:
i = np.random.randint(len(dist))
while (dist[i] >= chunk) or (i == parts - 1 and dist[i] >= last_chunk):
i = np.random.randint(len(dist))
dist[i] += 1
sample_size += 1
while sample_size > k:
i = np.random.randint(len(dist))
while dist[i] == 0:
i = np.random.randint(len(dist))
dist[i] -= 1
sample_size -= 1
assert sample_size == k
# Generate sample parts
sample_parts = []
for i, v in enumerate(np.nditer(dist)):
if v <= 0:
continue
c = chunk if i < parts - 1 else last_chunk
base = chunk * i
sample_parts.append(base + np.random.choice(c, v, replace=False))
sample = np.concatenate(sample_parts, axis=0)
np.random.shuffle(sample)
return sample
np.random.seed(100)
print(sample_chunked(15, 5, 4))
# [ 8 9 12 13 3]
A quick benchmark of sample_chunked(100000000, 100000, 100000) takes about 3.1 seconds in my computer, while I haven't been able to run the previous algorithm (sample_wo_replacement function above) to completion with the same parameters. It should be possible to implement it in TensorFlow, maybe using tf.TensorArray, although it would require significant effort to get it exactly right.
use the gumbel-max trick here: https://github.com/tensorflow/tensorflow/issues/9260
z = -tf.log(-tf.log(tf.random_uniform(tf.shape(logits),0,1)))
_, indices = tf.nn.top_k(logits + z,K)
indices are what you want. This tick is so easy~!
The following works fairly fast on the GPU, and I did not encounter memory issues when using n~100M and k~10k (using NVIDIA GeForce GTX 1080 Ti):
def random_choice_without_replacement(n, k):
"""equivalent to 'numpy.random.choice(n, size=k, replace=False)'"""
return tf.math.top_k(tf.random.uniform(shape=[n]), k, sorted=False).indices

Fastest way to create a sparse matrix of the form A.T * diag(b) * A + C?

I'm trying to optimize a piece of code that solves a large sparse nonlinear system using an interior point method. During the update step, this involves computing the Hessian matrix H, the gradient g, then solving for d in H * d = -g to get the new search direction.
The Hessian matrix has a symmetric tridiagonal structure of the form:
A.T * diag(b) * A + C
I've run line_profiler on the particular function in question:
Line # Hits Time Per Hit % Time Line Contents
==================================================
386 def _direction(n, res, M, Hsig, scale_var, grad_lnprior, z, fac):
387
388 # gradient
389 44 1241715 28220.8 3.7 g = 2 * scale_var * res - grad_lnprior + z * np.dot(M.T, 1. / n)
390
391 # hessian
392 44 3103117 70525.4 9.3 N = sparse.diags(1. / n ** 2, 0, format=FMT, dtype=DTYPE)
393 44 18814307 427597.9 56.2 H = - Hsig - z * np.dot(M.T, np.dot(N, M)) # slow!
394
395 # update direction
396 44 10329556 234762.6 30.8 d, fac = my_solver(H, -g, fac)
397
398 44 111 2.5 0.0 return d, fac
Looking at the output it's clear that constructing H is by far the most costly step - it takes considerably longer than actually solving for the new direction.
Hsig and M are both CSC sparse matrices, n is a dense vector and z is a scalar. The solver I'm using requires H to be either a CSC or CSR sparse matrix.
Here's a function that produces some toy data with the same formats, dimensions and sparseness as my real matrices:
import numpy as np
from scipy import sparse
def make_toy_data(nt=200000, nc=10):
d0 = np.random.randn(nc * (nt - 1))
d1 = np.random.randn(nc * (nt - 1))
M = sparse.diags((d0, d1), (0, nc), shape=(nc * (nt - 1), nc * nt),
format='csc', dtype=np.float64)
d0 = np.random.randn(nc * nt)
Hsig = sparse.diags(d0, 0, shape=(nc * nt, nc * nt), format='csc',
dtype=np.float64)
n = np.random.randn(nc * (nt - 1))
z = np.random.randn()
return Hsig, M, n, z
And here's my original approach for constructing H:
def original(Hsig, M, n, z):
N = sparse.diags(1. / n ** 2, 0, format='csc')
H = - Hsig - z * np.dot(M.T, np.dot(N, M)) # slow!
return H
Timing:
%timeit original(Hsig, M, n, z)
# 1 loops, best of 3: 483 ms per loop
Is there a faster way to construct this matrix?
I get close to a 4x speed-up in computing the product M.T * D * M out of the three diagonal arrays. If d0 and d1 are the main and upper diagonal of M, and d is the main diagonal of D, then the following code creates M.T * D * M directly:
def make_tridi_bis(d0, d1, d, nc=10):
d00 = d0*d0*d
d11 = d1*d1*d
d01 = d0*d1*d
len_ = d0.size
data = np.empty((3*len_ + nc,))
indices = np.empty((3*len_ + nc,), dtype=np.int)
# Fill main diagonal
data[:2*nc:2] = d00[:nc]
indices[:2*nc:2] = np.arange(nc)
data[2*nc+1:-2*nc:3] = d00[nc:] + d11[:-nc]
indices[2*nc+1:-2*nc:3] = np.arange(nc, len_)
data[-2*nc+1::2] = d11[-nc:]
indices[-2*nc+1::2] = np.arange(len_, len_ + nc)
# Fill top diagonal
data[1:2*nc:2] = d01[:nc]
indices[1:2*nc:2] = np.arange(nc, 2*nc)
data[2*nc+2:-2*nc:3] = d01[nc:]
indices[2*nc+2:-2*nc:3] = np.arange(2*nc, len_+nc)
# Fill bottom diagonal
data[2*nc:-2*nc:3] = d01[:-nc]
indices[2*nc:-2*nc:3] = np.arange(len_ - nc)
data[-2*nc::2] = d01[-nc:]
indices[-2*nc::2] = np.arange(len_ - nc ,len_)
indptr = np.empty((len_ + nc + 1,), dtype=np.int)
indptr[0] = 0
indptr[1:nc+1] = 2
indptr[nc+1:len_+1] = 3
indptr[-nc:] = 2
np.cumsum(indptr, out=indptr)
return sparse.csr_matrix((data, indices, indptr), shape=(len_+nc, len_+nc))
If your matrix M were in CSR format, you can extract d0 and d1 as d0 = M.data[::2] and d1 = M.data[1::2], I modified you toy data making routine to return those arrays as well, and here's what I get:
In [90]: np.allclose((M.T * sparse.diags(d, 0) * M).A, make_tridi_bis(d0, d1, d).A)
Out[90]: True
In [92]: %timeit make_tridi_bis(d0, d1, d)
10 loops, best of 3: 124 ms per loop
In [93]: %timeit M.T * sparse.diags(d, 0) * M
1 loops, best of 3: 501 ms per loop
The whole purpose of the above code is to take advantage of the structure of the non-zero entries. If you draw a diagram of the matrices you are multiplying together, it is relatively easy to convince yourself that the main (d_0) and top and bottom (d_1) diagonals of the resulting tridiagonal matrix are simply:
d_0 = np.zeros((len_ + nc,))
d_0[:len_] = d00
d_0[-len_:] += d11
d_1 = d01
The rest of the code in that function is simply building the tridiagonal matrix directly, as calling sparse.diags with the above data is several times slower.
I tried running your test case and had problems with the np.dot(N, M). I didn't dig into it, but I think my numpy/sparse combo (both pretty new) had problems using np.dot on sparse arrays.
But H = -Hsig - z*M.T.dot(N.dot(M)) runs just fine. This uses the sparse dot.
I haven't run a profile, but here are Ipython timings for several parts. It takes longer to generate the data than to do that double dot.
In [37]: timeit Hsig,M,n,z=make_toy_data()
1 loops, best of 3: 2 s per loop
In [38]: timeit N = sparse.diags(1. / n ** 2, 0, format='csc')
1 loops, best of 3: 377 ms per loop
In [39]: timeit H = -Hsig - z*M.T.dot(N.dot(M))
1 loops, best of 3: 1.55 s per loop
H is a
<2000000x2000000 sparse matrix of type '<type 'numpy.float64'>'
with 5999980 stored elements in Compressed Sparse Column format>

SHA 256 pseuedocode?

I've been trying to work out how SHA-256 works. One thing I've been doing for other algorithms is I've worked out a sort of step by step pseudocode function for the algorithm.
I've tried to do the same for SHA256 but thus far I'm having quite a bit of trouble.
I've tried to work out how the Wikipedia diagram works but besides the text part explaining the functions I'm not sure I've got it right.
Here's what I have so far:
Input is an array 8 items long where each item is 32 bits.
Output is an array 8 items long where each item is 32 bits.
Calculate all the function boxes and store those values. I'll refer to them by function name.
Store input, right shifted by 32 bits, into output.
At this point, in the out array, E is the wrong value and A is empty
Store the function boxes.
now we need to calculate out E and out A.
note: I've replaced the modulo commands with a bitwise AND 2^(32-1)
I can't figure out how the modulus adding lines up, but I think it is like this:
Store (Input H + Ch + ( (Wt+Kt) AND 2^31 ) ) AND 2^31 As mod1
Store (sum1 + mod1) AND 2^31 as mod2
Store (d + mod2) AND 2^31 into output E
now output E is correct and all we need is output A
Store (MA + mod2) AND 2^31 as mod3
Store (sum0 + mod3) AND 2^31 into output A
output now contains the correct hash of input.
Do we return now or does this need to be run repeatedly?
Did I get all of those addition modulos right?
what are Wt and Kt?
Would this get run once, and you're done or does it need to be run a certain number of times, with the output being re-used as input?
Here's the link by the way.
http://en.wikipedia.org/wiki/SHA-2#Hash_function
Thanks alot,
Brian
Have a look at the official standard that describes the algorithm, the variables are described here: http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
(Oh, now I see I'm almost a year late with my answer, ah, never mind...)
W_t is derived from the current block being processed while K_t is a fixed constant determined by the iteration number. The compression function is repeated 64 times for each block in SHA256. There is a specific constant K_t and a derived value W_t for each iteration 0 <= t <= 63.
I have provided my own implementation of SHA256 using Python 3.6. The tuple K contains the 64 constant values of K_t. The Sha256 function shows how the value of W_t is computed in the list W. The implementation focuses on code clarity and not high-performance.
W = 32 #Number of bits in word
M = 1 << W
FF = M - 1 #0xFFFFFFFF (for performing addition mod 2**32)
#Constants from SHA256 definition
K = (0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2)
#Initial values for compression function
I = (0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19)
def RR(x, b):
'''
32-bit bitwise rotate right
'''
return ((x >> b) | (x << (W - b))) & FF
def Pad(W):
'''
Pads a message and converts to byte array
'''
mdi = len(W) % 64
L = (len(W) << 3).to_bytes(8, 'big') #Binary of len(W) in bits
npad = 55 - mdi if mdi < 56 else 119 - mdi #Pad so 64 | len; add 1 block if needed
return bytes(W, 'ascii') + b'\x80' + (b'\x00' * npad) + L #64 | 1 + npad + 8 + len(W)
def Sha256CF(Wt, Kt, A, B, C, D, E, F, G, H):
'''
SHA256 Compression Function
'''
Ch = (E & F) ^ (~E & G)
Ma = (A & B) ^ (A & C) ^ (B & C) #Major
S0 = RR(A, 2) ^ RR(A, 13) ^ RR(A, 22) #Sigma_0
S1 = RR(E, 6) ^ RR(E, 11) ^ RR(E, 25) #Sigma_1
T1 = H + S1 + Ch + Wt + Kt
return (T1 + S0 + Ma) & FF, A, B, C, (D + T1) & FF, E, F, G
def Sha256(M):
'''
Performs SHA256 on an input string
M: The string to process
return: A 32 byte array of the binary digest
'''
M = Pad(M) #Pad message so that length is divisible by 64
DG = list(I) #Digest as 8 32-bit words (A-H)
for j in range(0, len(M), 64): #Iterate over message in chunks of 64
S = M[j:j + 64] #Current chunk
W = [0] * 64
W[0:16] = [int.from_bytes(S[i:i + 4], 'big') for i in range(0, 64, 4)]
for i in range(16, 64):
s0 = RR(W[i - 15], 7) ^ RR(W[i - 15], 18) ^ (W[i - 15] >> 3)
s1 = RR(W[i - 2], 17) ^ RR(W[i - 2], 19) ^ (W[i - 2] >> 10)
W[i] = (W[i - 16] + s0 + W[i-7] + s1) & FF
A, B, C, D, E, F, G, H = DG #State of the compression function
for i in range(64):
A, B, C, D, E, F, G, H = Sha256CF(W[i], K[i], A, B, C, D, E, F, G, H)
DG = [(X + Y) & FF for X, Y in zip(DG, (A, B, C, D, E, F, G, H))]
return b''.join(Di.to_bytes(4, 'big') for Di in DG) #Convert to byte array
if __name__ == "__main__":
bd = Sha256('Hello World')
print(''.join('{:02x}'.format(i) for i in bd))
initial_hash_values=[
'6a09e667','bb67ae85','3c6ef372','a54ff53a',
'510e527f','9b05688c','1f83d9ab','5be0cd19'
]
sha_256_constants=[
'428a2f98','71374491','b5c0fbcf','e9b5dba5',
'3956c25b','59f111f1','923f82a4','ab1c5ed5',
'd807aa98','12835b01','243185be','550c7dc3',
'72be5d74','80deb1fe','9bdc06a7','c19bf174',
'e49b69c1','efbe4786','0fc19dc6','240ca1cc',
'2de92c6f','4a7484aa','5cb0a9dc','76f988da',
'983e5152','a831c66d','b00327c8','bf597fc7',
'c6e00bf3','d5a79147','06ca6351','14292967',
'27b70a85','2e1b2138','4d2c6dfc','53380d13',
'650a7354','766a0abb','81c2c92e','92722c85',
'a2bfe8a1','a81a664b','c24b8b70','c76c51a3',
'd192e819','d6990624','f40e3585','106aa070',
'19a4c116','1e376c08','2748774c','34b0bcb5',
'391c0cb3','4ed8aa4a','5b9cca4f','682e6ff3',
'748f82ee','78a5636f','84c87814','8cc70208',
'90befffa','a4506ceb','bef9a3f7','c67178f2'
]
def bin_return(dec):
return(str(format(dec,'b')))
def bin_8bit(dec):
return(str(format(dec,'08b')))
def bin_32bit(dec):
return(str(format(dec,'032b')))
def bin_64bit(dec):
return(str(format(dec,'064b')))
def hex_return(dec):
return(str(format(dec,'x')))
def dec_return_bin(bin_string):
return(int(bin_string,2))
def dec_return_hex(hex_string):
return(int(hex_string,16))
def L_P(SET,n):
to_return=[]
j=0
k=n
while k<len(SET)+1:
to_return.append(SET[j:k])
j=k
k+=n
return(to_return)
def s_l(bit_string):
bit_list=[]
for i in range(len(bit_string)):
bit_list.append(bit_string[i])
return(bit_list)
def l_s(bit_list):
bit_string=''
for i in range(len(bit_list)):
bit_string+=bit_list[i]
return(bit_string)
def rotate_right(bit_string,n):
bit_list = s_l(bit_string)
count=0
while count <= n-1:
list_main=list(bit_list)
var_0=list_main.pop(-1)
list_main=list([var_0]+list_main)
bit_list=list(list_main)
count+=1
return(l_s(list_main))
def shift_right(bit_string,n):
bit_list=s_l(bit_string)
count=0
while count <= n-1:
bit_list.pop(-1)
count+=1
front_append=['0']*n
return(l_s(front_append+bit_list))
def mod_32_addition(input_set):
value=0
for i in range(len(input_set)):
value+=input_set[i]
mod_32 = 4294967296
return(value%mod_32)
def xor_2str(bit_string_1,bit_string_2):
xor_list=[]
for i in range(len(bit_string_1)):
if bit_string_1[i]=='0' and bit_string_2[i]=='0':
xor_list.append('0')
if bit_string_1[i]=='1' and bit_string_2[i]=='1':
xor_list.append('0')
if bit_string_1[i]=='0' and bit_string_2[i]=='1':
xor_list.append('1')
if bit_string_1[i]=='1' and bit_string_2[i]=='0':
xor_list.append('1')
return(l_s(xor_list))
def and_2str(bit_string_1,bit_string_2):
and_list=[]
for i in range(len(bit_string_1)):
if bit_string_1[i]=='1' and bit_string_2[i]=='1':
and_list.append('1')
else:
and_list.append('0')
return(l_s(and_list))
def or_2str(bit_string_1,bit_string_2):
or_list=[]
for i in range(len(bit_string_1)):
if bit_string_1[i]=='0' and bit_string_2[i]=='0':
or_list.append('0')
else:
or_list.append('1')
return(l_s(or_list))
def not_str(bit_string):
not_list=[]
for i in range(len(bit_string)):
if bit_string[i]=='0':
not_list.append('1')
else:
not_list.append('0')
return(l_s(not_list))
'''
SHA-256 Specific Functions:
'''
def Ch(x,y,z):
return(xor_2str(and_2str(x,y),and_2str(not_str(x),z)))
def Maj(x,y,z):
return(xor_2str(xor_2str(and_2str(x,y),and_2str(x,z)),and_2str(y,z)))
def e_0(x):
return(xor_2str(xor_2str(rotate_right(x,2),rotate_right(x,13)),rotate_right(x,22)))
def e_1(x):
return(xor_2str(xor_2str(rotate_right(x,6),rotate_right(x,11)),rotate_right(x,25)))
def s_0(x):
return(xor_2str(xor_2str(rotate_right(x,7),rotate_right(x,18)),shift_right(x,3)))
def s_1(x):
return(xor_2str(xor_2str(rotate_right(x,17),rotate_right(x,19)),shift_right(x,10)))
def message_pad(bit_list):
pad_one = bit_list + '1'
pad_len = len(pad_one)
k=0
while ((pad_len+k)-448)%512 != 0:
k+=1
back_append_0 = '0'*k
back_append_1 = bin_64bit(len(bit_list))
return(pad_one+back_append_0+back_append_1)
def message_bit_return(string_input):
bit_list=[]
for i in range(len(string_input)):
bit_list.append(bin_8bit(ord(string_input[i])))
return(l_s(bit_list))
def message_pre_pro(input_string):
bit_main = message_bit_return(input_string)
return(message_pad(bit_main))
def message_parsing(input_string):
return(L_P(message_pre_pro(input_string),32))
def message_schedule(index,w_t):
new_word = bin_32bit(mod_32_addition([int(s_1(w_t[index-2]),2),int(w_t[index-7],2),int(s_0(w_t[index-15]),2),int(w_t[index-16],2)]))
return(new_word)
'''
This example of SHA_256 works for an input string >56 characters.
'''
def sha_256(input_string):
w_t=message_parsing(input_string)
a=bin_32bit(dec_return_hex(initial_hash_values[0]))
b=bin_32bit(dec_return_hex(initial_hash_values[1]))
c=bin_32bit(dec_return_hex(initial_hash_values[2]))
d=bin_32bit(dec_return_hex(initial_hash_values[3]))
e=bin_32bit(dec_return_hex(initial_hash_values[4]))
f=bin_32bit(dec_return_hex(initial_hash_values[5]))
g=bin_32bit(dec_return_hex(initial_hash_values[6]))
h=bin_32bit(dec_return_hex(initial_hash_values[7]))
for i in range(0,64):
if i <= 15:
t_1=mod_32_addition([int(h,2),int(e_1(e),2),int(Ch(e,f,g),2),int(sha_256_constants[i],16),int(w_t[i],2)])
t_2=mod_32_addition([int(e_0(a),2),int(Maj(a,b,c),2)])
h=g
g=f
f=e
e=mod_32_addition([int(d,2),t_1])
d=c
c=b
b=a
a=mod_32_addition([t_1,t_2])
a=bin_32bit(a)
e=bin_32bit(e)
if i > 15:
w_t.append(message_schedule(i,w_t))
t_1=mod_32_addition([int(h,2),int(e_1(e),2),int(Ch(e,f,g),2),int(sha_256_constants[i],16),int(w_t[i],2)])
t_2=mod_32_addition([int(e_0(a),2),int(Maj(a,b,c),2)])
h=g
g=f
f=e
e=mod_32_addition([int(d,2),t_1])
d=c
c=b
b=a
a=mod_32_addition([t_1,t_2])
a=bin_32bit(a)
e=bin_32bit(e)
hash_0 = mod_32_addition([dec_return_hex(initial_hash_values[0]),int(a,2)])
hash_1 = mod_32_addition([dec_return_hex(initial_hash_values[1]),int(b,2)])
hash_2 = mod_32_addition([dec_return_hex(initial_hash_values[2]),int(c,2)])
hash_3 = mod_32_addition([dec_return_hex(initial_hash_values[3]),int(d,2)])
hash_4 = mod_32_addition([dec_return_hex(initial_hash_values[4]),int(e,2)])
hash_5 = mod_32_addition([dec_return_hex(initial_hash_values[5]),int(f,2)])
hash_6 = mod_32_addition([dec_return_hex(initial_hash_values[6]),int(g,2)])
hash_7 = mod_32_addition([dec_return_hex(initial_hash_values[7]),int(h,2)])
final_hash = (hex_return(hash_0),
hex_return(hash_1),
hex_return(hash_2),
hex_return(hash_3),
hex_return(hash_4),
hex_return(hash_5),
hex_return(hash_6),
hex_return(hash_7))
return(final_hash)