I'm trying to permanently change fits headers using astropy; hdu.writeto has added "None" to my fits file - astropy

I want to
read in a fits file
change some of the headers
save it, with the changes, to a new fits file.
So far I think I have achieved this; however, the new fits file shows "None" when I do hdu.info(). I'm confused about what this means, why it's there, and whether it's bad.
I'll paste my code below with the associated outputs:
from astropy.io import fits
hdulist = fits.open('2D_comb_ff_wavcal_red_sci_2.fits')
hdu = hdulist[0]
hdulist.info()
Filename: 2D_comb_ff_wavcal_red_sci_2.fits
No. Name Ver Type Cards Dimensions Format
0 PRIMARY 1 PrimaryHDU 287 (1024, 1024) float32
print 'CRVAL1 then = ', hdu.header['CRVAL1']
print 'CRVAL2 then = ', hdu.header['CRVAL2']
CRVAL1 then = 14975.1660156
CRVAL2 then = 1.0
CRVAL1_orig = hdu.header['CRVAL1']
CRVAL2_orig = hdu.header['CRVAL2']
hdu.header['CRVAL1'] = CRVAL2_orig
hdu.header['CRVAL2'] = CRVAL1_orig
print 'CRVAL1 now = ', hdu.header['CRVAL1']
print 'CRVAL2 now = ', hdu.header['CRVAL2']
CRVAL1 now = 1.0
CRVAL2 now = 14975.1660156
hdu.writeto('newheader.fits', overwrite=True)
new = fits.open('newheader.fits')
print new.info()
Filename: newheader.fits
No. Name Ver Type Cards Dimensions Format
0 PRIMARY 1 PrimaryHDU 287 (1024, 1024) float32
None
The headers have been updated (I checked), but why does it say "None" here? Is hdu.writeto() the wrong thing to use? I've tried reading about flush(), but it confused me.
Thanks

That is because you are printing what .info() returns. Note that in the fourth line you have
hdulist.info()
with no print, yet you get the output you expect. Furthermore, you do correctly get
0 PRIMARY 1 PrimaryHDU 287 (1024, 1024) float32
when you call new.info(). To sum up: the method itself does the printing and returns nothing, i.e. None by default, and since you print that return value, it gets displayed as a string.
Just remove the final print.
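In other words, a minimal sketch of the corrected last lines (the same session as above, just without the print):
new = fits.open('newheader.fits')
new.info()
Filename: newheader.fits
No. Name Ver Type Cards Dimensions Format
0 PRIMARY 1 PrimaryHDU 287 (1024, 1024) float32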

Related

Change number format in Excel using names of headers - openpyxl [duplicate]

I have an Excel (.xlsx) file that I'm trying to parse, row by row. I have a header (first row) that has a bunch of column titles like School, First Name, Last Name, Email, etc.
When I loop through each row, I want to be able to say something like:
row['School']
and get back the value of the cell in the current row and the column with 'School' as its title.
I've looked through the OpenPyXL docs but can't seem to find anything terribly helpful.
Any suggestions?
I'm not incredibly familiar with OpenPyXL, but as far as I can tell it doesn't have any kind of dict reader/iterator helper. However, it's fairly easy to iterate over the worksheet rows, as well as to create a dict from two lists of values.
def iter_worksheet(worksheet):
    # It's necessary to get a reference to the generator, as
    # `worksheet.rows` returns a new iterator on each access.
    rows = worksheet.rows
    # Get the header values as keys and move the iterator to the next item
    keys = [c.value for c in next(rows)]
    for row in rows:
        values = [c.value for c in row]
        yield dict(zip(keys, values))
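For example, usage might look like this (a minimal sketch; the filename and the 'School' column title are just placeholders):
from openpyxl import load_workbook

wb = load_workbook('example.xlsx', read_only=True)
for row in iter_worksheet(wb.active):
    print(row['School'])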
Excel sheets are far more flexible than CSV files, so it makes little sense to have something like DictReader.
Just create an auxiliary dictionary from the relevant column titles.
If you have columns like "School", "First Name", "Last Name", "EMail", you can create the dictionary like this:
keys = dict((value, idx) for (idx, value) in enumerate(values))
for row in ws.rows[1:]:
    school = row[keys['School']].value
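Put together, a minimal self-contained sketch of this approach (the filename is a placeholder; here `values` is the first row's cell values, and `iter_rows` is used because on recent openpyxl versions `ws.rows` is a generator that cannot be sliced):
from openpyxl import load_workbook

wb = load_workbook('example.xlsx')
ws = wb.active

values = [cell.value for cell in next(ws.iter_rows(min_row=1, max_row=1))]
keys = dict((value, idx) for (idx, value) in enumerate(values))

for row in ws.iter_rows(min_row=2):
    school = row[keys['School']].value
    print(school)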
I wrote a DictReader based on openpyxl. Save the second listing to a file 'excel.py' and use it like csv.DictReader. See the usage example in the first listing.
with open('example01.xlsx', 'rb') as source_data:
    from excel import DictReader
    for row in DictReader(source_data, sheet_index=0):
        print(row)
excel.py:
__all__ = ['DictReader']

from openpyxl import load_workbook
from openpyxl.cell import Cell

# Change the default value for the Cell from None to '' the same way as in csv.DictReader
Cell.__init__.__defaults__ = (None, None, '', None)


class DictReader(object):
    def __init__(self, f, sheet_index,
                 fieldnames=None, restkey=None, restval=None):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = load_workbook(f, data_only=True).worksheets[sheet_index].iter_rows(values_only=True)
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
                self.line_num += 1
            except StopIteration:
                pass
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def __next__(self):
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num += 1
        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == ():
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d
The following seems to work for me.
header = True
headings = []
for row in ws.rows:
    if header:
        for cell in row:
            headings.append(cell.value)
        header = False
        continue
    rowData = dict(zip(headings, row))
    wantedValue = rowData['myHeading'].value
I was running into the same issue as described above. Therefore I created a simple extension called openpyxl-dictreader that can be installed through pip. It is very similar to the suggestion made by @viktor earlier in this thread.
The package is largely based on the source code of Python's native csv.DictReader class. It allows you to select items based on column names using openpyxl. For example:
import openpyxl_dictreader

reader = openpyxl_dictreader.DictReader("names.xlsx", "Sheet1")
for row in reader:
    print(row["First Name"], row["Last Name"])
Putting this here for reference.

Is there a method for converting a wimids object to a mids object?

Suppose I create 10 multiply-imputed datasets and use the (wonderful) MatchThem package in R to create weights for my exposure variable. The MatchThem package takes a mids object and converts it to an object of class wimids.
My desired output is a mids object, but with weights. I hope to pass this mids object to brms as follows:
library(brms)
m0 <- brm_multiple(Y|weights(weights) ~ A, data = mids_data)
Open to suggestions.
EDIT: Noah's solution below will unfortunately not work.
The package's first author, Farhad Pishgar, sent me the following elegant solution. It will create a mids object from a wimids object. Thank you Farhad!
library(mice)
library(MatchThem)
#"weighted.dataset" is our .wimids object
#Extracting the original dataset with missing value
maindataset <- complete(weighted.datasets, action = 0)
#Some spit-and-polish
maindataset <- data.frame(.imp = 0, .id = seq_len(nrow(maindataset)), maindataset)
#Extracting imputed-weighted datasets in the long format
alldataset <- complete(weighted.datasets, action = "long")
#Binding them together
alldataset <- rbind(maindataset, alldataset)
#Converting to .mids
newmids <- as.mids(alldataset)
Additionally, for brms, I worked out this solution, which instead creates a list of data frames. It works in fewer steps.
library("mice")
library("dplyr")
library("MatchThem")
library("brms") # for bayesian estimation.
# Note, I realise that my approach here is not fully Bayesian, but that is a good thing! I need to ensure balance in the exposure.
# impute missing data
data("nhanes2")
imp <- mice(nhanes2, printFlag = FALSE, seed = 0, m = 10)
# MatchThem. This is just a fast method
w_imp <- weightthem(hyp ~ chl + age, data = imp,
                    approach = "within",
                    estimand = "ATE",
                    method = "ps")
# get individual data frames with weights
out <- complete(w_imp, action = "long", include = FALSE, mild = TRUE)
# assemble individual data frames into a list
m <- 10
listdat <- list()
for (i in 1:m) {
  listdat[[i]] <- as.data.frame(out[[i]])
}
# pass the list to brms, and it runs as it should!
fit_1 <- brm_multiple(bmi | weights(weights) ~ age + hyp + chl,
                      data = listdat,
                      backend = "cmdstanr",
                      family = "gaussian",
                      prior = set_prior('normal(0, 1)',
                                        class = 'b'))
brm_multiple() can take in a list of data frames for its data argument. You can produce this from the wimids object using complete(). The output of complete() with action = "all" is a mild object, which is a list of data frames, but this is not recognized by brm_multiple() as such. So, you can just convert it to a list. This should look like the following:
df_list <- complete(mids_data, "all")
class(df_list) <- "list"
m0 <- brm_multiple(Y|weights(weights) ~ A, data = df_list)
Using complete() automatically adds a weights column to the resulting imputed data frames.

Changing label name when retraining Inception on Google Cloud ML

I currently follow the tutorial to retrain Inception for image classification:
https://cloud.google.com/blog/big-data/2016/12/how-to-train-and-classify-images-using-google-cloud-machine-learning-and-cloud-dataflow
However, when I make a prediction with the API I only get the index of my class back as the label. I would like the API to give me a string with the actual class name, e.g. instead of
predictions:
- key: '0'
  prediction: 4
  scores:
  - 8.11998e-09
  - 2.64907e-08
  - 1.10307e-06
I would like to get:
predictions:
- key: '0'
  prediction: ROSES
  scores:
  - 8.11998e-09
  - 2.64907e-08
  - 1.10307e-06
Looking at the reference for the Google API it should be possible:
https://cloud.google.com/ml-engine/reference/rest/v1/projects/predict
I already tried changing the following in model.py
outputs = {
    'key': keys.name,
    'prediction': tensors.predictions[0].name,
    'scores': tensors.predictions[1].name
}
tf.add_to_collection('outputs', json.dumps(outputs))
to
if tensors.predictions[0].name == 0:
    pred_name = 'roses'
elif tensors.predictions[0].name == 1:
    pred_name = 'tulips'

outputs = {
    'key': keys.name,
    'prediction': pred_name,
    'scores': tensors.predictions[1].name
}
tf.add_to_collection('outputs', json.dumps(outputs))
but this doesn't work.
My next idea was to change this part of the preprocess.py file, so that instead of the index I use the string label.
def process(self, row, all_labels):
    try:
        row = row.element
    except AttributeError:
        pass
    if not self.label_to_id_map:
        for i, label in enumerate(all_labels):
            label = label.strip()
            if label:
                self.label_to_id_map[label] = label  #i
and
label_ids = []
for label in row[1:]:
    try:
        label_ids.append(label.strip())
        #label_ids.append(self.label_to_id_map[label.strip()])
    except KeyError:
        unknown_label.inc()
but this gives the error:
TypeError: 'roses' has type <type 'str'>, but expected one of: (<type 'int'>, <type 'long'>) [while running 'Embed and make TFExample']
Hence I thought I should change something here in preprocess.py in order to allow strings:
example = tf.train.Example(features=tf.train.Features(feature={
    'image_uri': _bytes_feature([uri]),
    'embedding': _float_feature(embedding.ravel().tolist()),
}))
if label_ids:
    label_ids.sort()
    example.features.feature['label'].int64_list.value.extend(label_ids)
But I don't know how to change it appropriately, as I could not find something like str_list. Could anyone please help me out here?
Online prediction certainly allows this, but the model itself needs to be updated to do the conversion from int to string.
Keep in mind that the Python code is just building a graph which describes what computation to do in your model -- you're not sending the Python code to online prediction, you're sending the graph you build.
That distinction is important because the changes you have made are in Python -- you don't yet have any inputs or predictions, so you won't be able to inspect their values. What you need to do instead is add the equivalent lookups to the graph that you're exporting.
You could modify the code like so:
labels = tf.constant(['cars', 'trucks', 'suvs'])
predicted_indices = tf.argmax(softmax, 1)
prediction = tf.gather(labels, predicted_indices)
And leave the inputs/outputs from the original code untouched.
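As a standalone illustration of what that lookup does inside the graph (a minimal sketch in TF 1.x style; the label names and softmax values here are placeholders, not the retrained model's real ones):
import tensorflow as tf

labels = tf.constant(['daisy', 'roses', 'tulips'])            # class names, in index order
softmax = tf.constant([[0.1, 0.7, 0.2], [0.05, 0.15, 0.8]])   # two fake prediction rows
predicted_indices = tf.argmax(softmax, 1)                     # -> [1, 2]
prediction = tf.gather(labels, predicted_indices)             # -> ['roses', 'tulips']

with tf.Session() as sess:
    print(sess.run(prediction))
Because the lookup is part of the exported graph, the prediction service returns the class name instead of the bare index.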

How to format input data for textsum data_convert_example

I was hoping someone may be able to see where I am failing here. I have scraped some data from BuzzFeed and now I am trying to format a text file that I can then send through the data_convert_example text_to_data formatter.
I thought I had the answer a couple of times, but I am still running up against a brick wall when I process this as binary and then try to train against the data.
What I did was run binary_to_text on the toy dataset, open the file in Notepad++ under Windows with all characters shown, and match what I believed to be the format.
I apologize for the long function below, but I really am unsure as to where the issue might be and figured this was the best way to provide enough info. Does anyone have any ideas or recommendations?
def processPath(self, toPath):
    try:
        fout = open(os.path.join(toPath, '{}-{}'.format(self.baseName, self.fileNdx)), 'a+')
        for path, dirs, files in os.walk(self.fromPath):
            for fn in files:
                fullpath = os.path.join(path, fn)
                if os.path.isfile(fullpath):
                    #with open(fullpath, "rb") as f:
                    with codecs.open(fullpath, "rb", 'ascii', "ignore") as f:
                        try:
                            finalRes = ""
                            content = f.readlines()
                            self.populateVocab(content)
                            sentences = sent_tokenize((content[1]).encode('ascii', "ignore").strip('\n'))
                            for sent in sentences:
                                textSumFmt = self.textsumFmt
                                finalRes = textSumFmt["artPref"] + textSumFmt["sentPref"] + sent.replace("=", "equals") + textSumFmt["sentPost"] + textSumFmt["postVal"]
                            finalRes += (('\t' + textSumFmt["absPref"] + textSumFmt["sentPref"] + (content[0]).strip('\n').replace("=", "equals") + textSumFmt["sentPost"] + textSumFmt["postVal"]) + '\t' + 'publisher=BUZZ' + os.linesep)
                            if self.lineNdx != 0 and self.lineNdx % self.lines == 0:
                                fout.close()
                                self.fileNdx += 1
                                fout = open(os.path.join(toPath, '{}-{}'.format(self.baseName, self.fileNdx)), 'a+')
                            fout.write(("{}").format(finalRes.encode('utf-8', "ignore")))
                            self.lineNdx += 1
                        except RuntimeError as e:
                            print "Runtime Error: {0} : {1}".format(e.errno, e.strerror)
    finally:
        fout.close()
After further analysis, it seems that the source of the problem lies more with the source data and the way it is constructed than with data_convert_example.py itself. I'm closing this, as the heading is not in line with the source of the issue.
I found that the source of my problem was a space between "Article" and the equals sign. After removing it I was able to train successfully.

Exporting a 3D numpy to a VTK file for viewing in Paraview/Mayavi

For those who want to export a simple 3D numpy array (along with axes) to a .vtk (or .vtr) file for post-processing and display in Paraview or Mayavi, there's a little module called PyEVTK that does exactly that. The module supports structured and unstructured data, etc.
Unfortunately, even though the code works fine on Unix-based systems, I couldn't make it work (it keeps crashing) on any Windows installation, which simply makes things complicated. I've contacted the developer, but his suggestions did not work.
Therefore my question is:
How can one use the numpy_support module (from vtk.util import numpy_support) to export a 3D array (the conversion function itself doesn't support 3D arrays) to a .vtk file? Is there a simple way to do it without creating vtkDatasets, etc.?
Thanks a lot!
It's been forever and I had entirely forgotten asking this question but I ended up figuring it out. I've written a post about it in my blog (PyScience) providing a tutorial on how to convert between NumPy and VTK. Do take a look if interested:
pyscience.wordpress.com/2014/09/06/numpy-to-vtk-converting-your-numpy-arrays-to-vtk-arrays-and-files/
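For reference, a minimal sketch of the numpy_support route the question asks about (this is not the blog's exact code; the array contents and file name are placeholders):
import numpy as np
import vtk
from vtk.util import numpy_support

data = np.random.random((30, 40, 50))

# numpy_to_vtk only accepts flat arrays, so flatten in Fortran order
# to match VTK's x-fastest point ordering.
vtk_array = numpy_support.numpy_to_vtk(data.ravel(order='F'), deep=True,
                                        array_type=vtk.VTK_DOUBLE)

image = vtk.vtkImageData()
image.SetDimensions(data.shape)
image.GetPointData().SetScalars(vtk_array)

writer = vtk.vtkXMLImageDataWriter()
writer.SetFileName('array.vti')
writer.SetInputData(image)
writer.Write()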
It's not a direct answer to your question, but if you have tvtk (if you have mayavi, you should have it), you can use it to write your data to vtk format. (See: http://code.enthought.com/projects/files/ETS3_API/enthought.tvtk.misc.html )
It doesn't use PyEVTK, and it supports a broad range of data sources (more than just structured and unstructured grids), so it will probably work where other options won't.
As a quick example (Mayavi's mlab interface can make this much less verbose, especially if you're already using it.):
import numpy as np
from enthought.tvtk.api import tvtk, write_data

data = np.random.random((10, 10, 10))

grid = tvtk.ImageData(spacing=(10, 5, -10), origin=(100, 350, 200),
                      dimensions=data.shape)
grid.point_data.scalars = data.ravel(order='F')
grid.point_data.scalars.name = 'Test Data'

# Writes legacy ".vtk" format if filename ends with "vtk", otherwise
# this will write data using the newer xml-based format.
write_data(grid, 'test.vtk')
And a portion of the output file:
# vtk DataFile Version 3.0
vtk output
ASCII
DATASET STRUCTURED_POINTS
DIMENSIONS 10 10 10
SPACING 10 5 -10
ORIGIN 100 350 200
POINT_DATA 1000
SCALARS Test%20Data double
LOOKUP_TABLE default
0.598189 0.228948 0.346975 0.948916 0.0109774 0.30281 0.643976 0.17398 0.374673
0.295613 0.664072 0.307974 0.802966 0.836823 0.827732 0.895217 0.104437 0.292796
0.604939 0.96141 0.0837524 0.498616 0.608173 0.446545 0.364019 0.222914 0.514992
...
...
Mayavi's TVTK has a beautiful way of writing VTK files. Here is a test example I have written for myself, following @Joe and the tvtk documentation. The advantage it has over evtk is the support for both the legacy ASCII and XML formats. Hope it will help other people.
from tvtk.api import tvtk, write_data
import numpy as np
#data = np.random.random((3, 3, 3))
#
#i = tvtk.ImageData(spacing=(1, 1, 1), origin=(0, 0, 0))
#i.point_data.scalars = data.ravel()
#i.point_data.scalars.name = 'scalars'
#i.dimensions = data.shape
#
#w = tvtk.XMLImageDataWriter(input=i, file_name='spoints3d.vti')
#w.write()
points = np.array([[0,0,0], [1,0,0], [1,1,0], [0,1,0]], 'f')
(n1, n2) = points.shape
poly_edge = np.array([[0,1,2,3]])
print n1, n2
## Scalar Data
#temperature = np.array([10., 20., 30., 40.])
#pressure = np.random.rand(n1)
#
## Vector Data
#velocity = np.random.rand(n1,n2)
#force = np.random.rand(n1,n2)
#
##Tensor Data with
comp = 5
stress = np.random.rand(n1,comp)
#
#print stress.shape
## The TVTK dataset.
mesh = tvtk.PolyData(points=points, polys=poly_edge)
#
## Data 0 # scalar data
#mesh.point_data.scalars = temperature
#mesh.point_data.scalars.name = 'Temperature'
#
## Data 1 # additional scalar data
#mesh.point_data.add_array(pressure)
#mesh.point_data.get_array(1).name = 'Pressure'
#mesh.update()
#
## Data 2 # Vector data
#mesh.point_data.vectors = velocity
#mesh.point_data.vectors.name = 'Velocity'
#mesh.update()
#
## Data 3 additional vector data
#mesh.point_data.add_array( force)
#mesh.point_data.get_array(3).name = 'Force'
#mesh.update()
mesh.point_data.tensors = stress
mesh.point_data.tensors.name = 'Stress'
# Data 4 additional tensor Data
#mesh.point_data.add_array(stress)
#mesh.point_data.get_array(4).name = 'Stress'
#mesh.update()
write_data(mesh, 'polydata.vtk')
# XML format
# Method 1
#write_data(mesh, 'polydata')
# Method 2
#w = tvtk.XMLPolyDataWriter(input=mesh, file_name='polydata.vtk')
#w.write()
I know it is a bit late and I do love your tutorials, @somada141. This should work too.
import vtk

def numpy2VTK(img, spacing=[1.0, 1.0, 1.0]):
    # evolved from code from Stou S.,
    # on http://www.siafoo.net/snippet/314
    # This function, as the name suggests, converts a numpy array to VTK
    importer = vtk.vtkImageImport()

    img_data = img.astype('uint8')
    img_string = img_data.tostring()  # type short
    dim = img.shape

    importer.CopyImportVoidPointer(img_string, len(img_string))
    importer.SetDataScalarType(vtk.VTK_UNSIGNED_CHAR)
    importer.SetNumberOfScalarComponents(1)

    extent = importer.GetDataExtent()
    importer.SetDataExtent(extent[0], extent[0] + dim[2] - 1,
                           extent[2], extent[2] + dim[1] - 1,
                           extent[4], extent[4] + dim[0] - 1)
    importer.SetWholeExtent(extent[0], extent[0] + dim[2] - 1,
                            extent[2], extent[2] + dim[1] - 1,
                            extent[4], extent[4] + dim[0] - 1)

    importer.SetDataSpacing(spacing[0], spacing[1], spacing[2])
    importer.SetDataOrigin(0, 0, 0)
    return importer
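For example, a hypothetical usage (the volume and file name are placeholders) might look like this:
import numpy as np
import vtk

# Wrap a small random uint8 volume and write it out as an XML image
# file that Paraview can open; numpy2VTK is the function defined above.
volume = (np.random.random((10, 10, 10)) * 255).astype('uint8')

importer = numpy2VTK(volume)
importer.Update()

writer = vtk.vtkXMLImageDataWriter()
writer.SetFileName('volume.vti')
writer.SetInputData(importer.GetOutput())
writer.Write()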
Hope it helps!
Here's a SimpleITK version with the function load_itk taken from here:
import sys
import SimpleITK as sitk
import numpy as np

if len(sys.argv) < 3:
    print('Wrong number of arguments.', file=sys.stderr)
    print('Usage: ' + __file__ + ' input_sitk_file' + ' output_sitk_file', file=sys.stderr)
    sys.exit(1)

def quick_read(filename):
    # Read image information without reading the bulk data.
    file_reader = sitk.ImageFileReader()
    file_reader.SetFileName(filename)
    file_reader.ReadImageInformation()
    print('image size: {0}\nimage spacing: {1}'.format(file_reader.GetSize(), file_reader.GetSpacing()))
    # Some files have a rich meta-data dictionary (e.g. DICOM)
    for key in file_reader.GetMetaDataKeys():
        print(key + ': ' + file_reader.GetMetaData(key))

def load_itk(filename):
    # Reads the image using SimpleITK
    itkimage = sitk.ReadImage(filename)
    # Convert the image to a numpy array first and then shuffle the dimensions to get axis in the order z,y,x
    data = sitk.GetArrayFromImage(itkimage)
    # Read the origin of the ct_scan, will be used to convert the coordinates from world to voxel and vice versa.
    origin = np.array(list(reversed(itkimage.GetOrigin())))
    # Read the spacing along each dimension
    spacing = np.array(list(reversed(itkimage.GetSpacing())))
    return data, origin, spacing

def convert(data, output_filename):
    image = sitk.GetImageFromArray(data)
    writer = sitk.ImageFileWriter()
    writer.SetFileName(output_filename)
    writer.Execute(image)

def wait():
    print('Press Enter to load & convert or exit using Ctrl+C')
    input()

quick_read(sys.argv[1])
print('-' * 20)
wait()
data, origin, spacing = load_itk(sys.argv[1])
convert(data, sys.argv[2])