Break document sections into list for export Python - sql

I am very new to Python, and I am trying to break some legal documents into sections for export into SQL. I need to do two things:
Define the section numbers by the table of contents, and
Break up the document given the defined section numbers
The table of contents lists section numbers: 1.1, 1.2, 1.3, etc.
Then the document itself is broken up by those section numbers:
1.1 "...Text...",
1.2 "...Text...",
1.3 "...Text...", etc.
Similar to the chapters of a book, but delimited by ascending decimal numbers.
I have the document parsed using Tika, and I've been able to create a list of sections with some basic regex:
import tika
import re
from tika import parser
parsed = parser.from_file('test.pdf')
content = (parsed["content"])
headers = re.findall("[0-9]*[.][0-9]",content)
Now I need to do something like this:
splitsections = content.split() by headers
var_string = ', '.join('?' * len(splitsections))
query_string = 'INSERT INTO table VALUES (%s);' % var_string
cursor.execute(query_string, splitsections)
Sorry if all this is unclear. Still very new to this.
Any help you can provide would be most appreciated.

Everything tested except the last part with DB. Also the code can be improved, but this is another task. The main task is done.
In the list split_content there are all pieces of info you wanted (i.e. the text between 2.1 and 2.2, then 2.2 and 2.3, and so on, EXCLUDING num+name of sections itself (i.e. excluding 2.1 Continuation, 2.2 Name and so on).
I replaced tika by PyPDF2, as tika does not provide instruments needed for this task (i.e. I did not find how to provide the num of page I need and get its content).
def get_pdf_content(pdf_path,
start_page_table_contents, end_page_table_contents,
first_parsing_page, last_phrase_to_stop):
"""
:param pdf_path: Full path to the PDF file
:param start_page_table_contents: The page where the "Contents table" starts
:param end_page_table_contents: The page where the "Contents Table" ends
(i.e. the number of the page where Contents Table ENDs, i.e. not the next one)
:param first_parsing_page: The 1st page where we need to start data grabbing
:param last_phrase_to_stop: The phrase that tells the code where to stop grabbing.
The phrase must match exactly what is written in PDF.
This phrase will be excluded from the grabbed data.
:return:
"""
# ======== GRAB TABLE OF CONTENTS ========
start_page = start_page_table_contents
end_page = end_page_table_contents
table_of_contents_page_nums = range(start_page-1, end_page)
sections_of_articles = [] # ['2.1 Continuation', '2.2 Name', ... ]
open_file = open(pdf_path, "rb")
pdf = PyPDF2.PdfFileReader(open_file)
for page_num in table_of_contents_page_nums:
page_content = pdf.getPage(page_num).extractText()
page_sections = re.findall("[\d]+[.][\d][™\s\w;,-]+", page_content)
for section in page_sections:
cleared_section = section.replace('\n', '').strip()
sections_of_articles.append(cleared_section)
# ======== GRAB ALL NECESSARY CONTENT (MERGE ALL PAGES) ========
total_num_pages = pdf.getNumPages()
parsing_pages = range(first_parsing_page-1, total_num_pages)
full_parsing_content = '' # Merged pages
for parsing_page in parsing_pages:
page_content = pdf.getPage(parsing_page).extractText()
cleared_page = page_content.replace('\n', '')
# Remove page num from the start of "page_content"
# Covers the case with the page 65, 71 and others when the "page_content" starts
# with, for example, "616.6 Liability to Partners. (a) It is understood that"
# i.e. "61" is the page num and "6.6 Liability ..." is the section data
already_cleared = False
first_50_chars = cleared_page[:51]
for section in sections_of_articles:
if section in first_50_chars:
indx = cleared_page.index(section)
cleared_page = cleared_page[indx:]
already_cleared = True
break
# Covers all other cases
if not already_cleared:
page_num_to_remove = re.match(r'^\d+', cleared_page)
if page_num_to_remove:
cleared_page = cleared_page[len(str(page_num_to_remove.group(0))):]
full_parsing_content += cleared_page
# ======== BREAK ALL CONTENT INTO PIECES ACCORDING TO TABLE CONTENTS ========
split_content = []
num_sections = len(sections_of_articles)
for num_section in range(num_sections):
start = sections_of_articles[num_section]
# Get the last piece, i.e. "11.16 FATCA" (as there is no any "end" section after "11.16 FATCA", so we cant use
# the logic like "grab info between sections 11.1 and 11.2, 11.2 and 11.3 and so on")
if num_section == num_sections-1:
end = last_phrase_to_stop
else:
end = sections_of_articles[num_section + 1]
content = re.search('%s(.*)%s' % (start, end), full_parsing_content).group(1)
cleared_piece = content.replace('™', "'").strip()
if cleared_piece[0:3] == '. ':
cleared_piece = cleared_piece[3:]
# There are few appearances of "[Signature Page Follows]", as a "last_phrase_to_stop".
# We need the text between "11.16 FATCA" and the 1st appearance of "[Signature Page Follows]"
try:
indx = cleared_piece.index(end)
cleared_piece = cleared_piece[:indx]
except ValueError:
pass
split_content.append(cleared_piece)
# ======== INSERT TO DB ========
# Did not test this section
for piece in split_content:
var_string = ', '.join('?' * len(piece))
query_string = 'INSERT INTO table VALUES (%s);' % var_string
cursor.execute(query_string, parts)
How to use: (one of the possible way):
1) Save the code above in my_pdf_code.py
2) In the python shell:
import path.to.my_pdf_code as the_code
the_code.get_pdf_content('/home/username/Apollo_Investment_Fund_VIII_LPA_S1.pdf', 2, 4, 24, '[Signature Page Follows]')

Related

Change number format in Excel using names of headers - openpyxl [duplicate]

I have an Excel (.xlsx) file that I'm trying to parse, row by row. I have a header (first row) that has a bunch of column titles like School, First Name, Last Name, Email, etc.
When I loop through each row, I want to be able to say something like:
row['School']
and get back the value of the cell in the current row and the column with 'School' as its title.
I've looked through the OpenPyXL docs but can't seem to find anything terribly helpful.
Any suggestions?
I'm not incredibly familiar with OpenPyXL, but as far as I can tell it doesn't have any kind of dict reader/iterator helper. However, it's fairly easy to iterate over the worksheet rows, as well as to create a dict from two lists of values.
def iter_worksheet(worksheet):
# It's necessary to get a reference to the generator, as
# `worksheet.rows` returns a new iterator on each access.
rows = worksheet.rows
# Get the header values as keys and move the iterator to the next item
keys = [c.value for c in next(rows)]
for row in rows:
values = [c.value for c in row]
yield dict(zip(keys, values))
Excel sheets are far more flexible than CSV files so it makes little sense to have something like DictReader.
Just create an auxiliary dictionary from the relevant column titles.
If you have columns like "School", "First Name", "Last Name", "EMail" you can create the dictionary like this.
keys = dict((value, idx) for (idx, value) in enumerate(values))
for row in ws.rows[1:]:
school = row[keys['School'].value
I wrote DictReader based on openpyxl. Save the second listing to file 'excel.py' and use it as csv.DictReader. See usage example in the first listing.
with open('example01.xlsx', 'rb') as source_data:
from excel import DictReader
for row in DictReader(source_data, sheet_index=0):
print(row)
excel.py:
__all__ = ['DictReader']
from openpyxl import load_workbook
from openpyxl.cell import Cell
Cell.__init__.__defaults__ = (None, None, '', None) # Change the default value for the Cell from None to `` the same way as in csv.DictReader
class DictReader(object):
def __init__(self, f, sheet_index,
fieldnames=None, restkey=None, restval=None):
self._fieldnames = fieldnames # list of keys for the dict
self.restkey = restkey # key to catch long rows
self.restval = restval # default value for short rows
self.reader = load_workbook(f, data_only=True).worksheets[sheet_index].iter_rows(values_only=True)
self.line_num = 0
def __iter__(self):
return self
#property
def fieldnames(self):
if self._fieldnames is None:
try:
self._fieldnames = next(self.reader)
self.line_num += 1
except StopIteration:
pass
return self._fieldnames
#fieldnames.setter
def fieldnames(self, value):
self._fieldnames = value
def __next__(self):
if self.line_num == 0:
# Used only for its side effect.
self.fieldnames
row = next(self.reader)
self.line_num += 1
# unlike the basic reader, we prefer not to return blanks,
# because we will typically wind up with a dict full of None
# values
while row == ():
row = next(self.reader)
d = dict(zip(self.fieldnames, row))
lf = len(self.fieldnames)
lr = len(row)
if lf < lr:
d[self.restkey] = row[lf:]
elif lf > lr:
for key in self.fieldnames[lr:]:
d[key] = self.restval
return d
The following seems to work for me.
header = True
headings = []
for row in ws.rows:
if header:
for cell in row:
headings.append(cell.value)
header = False
continue
rowData = dict(zip(headings, row))
wantedValue = rowData['myHeading'].value
I was running into the same issue as described above. Therefore I created a simple extension called openpyxl-dictreader that can be installed through pip. It is very similar to the suggestion made by #viktor earlier in this thread.
The package is largely based on source code of Python's native csv.DictReader class. It allows you to select items based on column names using openpyxl. For example:
import openpyxl_dictreader
reader = openpyxl_dictreader.DictReader("names.xlsx", "Sheet1")
for row in reader:
print(row["First Name"], row["Last Name"])
Putting this here for reference.

combine two lists to PCollection

I'm using Apache Beam. When writing to tfRecord I need to include the ID of the item along with its text and embedding.
The tutorial works with just one list of text but I also have a list of the IDs to match the list of text so I was wondering how I could pass the ID to the following function:
def to_tf_example(entries):
examples = []
text_list, embedding_list = entries
for i in range(len(text_list)):
text = text_list[i]
embedding = embedding_list[i]
features = {
# need to pass in ID here like so:
'id': tf.train.Feature(
bytes_list=tf.train.BytesList(value=[ids.encode('utf-8')])),
'text': tf.train.Feature(
bytes_list=tf.train.BytesList(value=[text.encode('utf-8')])),
'embedding': tf.train.Feature(
float_list=tf.train.FloatList(value=embedding.tolist()))
}
example = tf.train.Example(
features=tf.train.Features(
feature=features)).SerializeToString(deterministic=True)
examples.append(example)
return examples
My first thought was just to include the ids in the text column of my database and then extract them via slicing or regex or something but was wondering if there was a better way, I assume converting to a PCollection but don't know where to start. Here is the pipeline:
with beam.Pipeline(args.runner, options=options) as pipeline:
query_data = pipeline | 'Read data from BigQuery' >>
beam.io.Read(beam.io.BigQuerySource(project='my-project', query=get_data(args.limit), use_standard_sql=True))
# list of texts
text = query_data | 'get list of text' >> beam.Map(lambda x: x['text'])
# list of ids
ids = query_data | 'get list of ids' >> beam.Map(lambda x: x['id'])
( text
| 'Batch elements' >> util.BatchElements(
min_batch_size=args.batch_size, max_batch_size=args.batch_size)
| 'Generate embeddings' >> beam.Map(
generate_embeddings, args.module_url, args.random_projection_matrix)
| 'Encode to tf example' >> beam.FlatMap(to_tf_example)
| 'Write to TFRecords files' >> beam.io.WriteToTFRecord(
file_path_prefix='{0}'.format(args.output_dir),
file_name_suffix='.tfrecords')
)
query_data | 'Convert to entity and write to datastore' >> beam.Map(
lambda input_features: create_entity(
input_features, args.kind))
I altered generate_embeddings to return List[int], List[string], List[List[float]] and then used the following function to pass the list of ids and text in:
def generate_embeddings_for_batch(batch, module_url, random_projection_matrix):
embeddings = generate_embeddings([x['id'] for x in batch], [x['text'] for x in batch], module_url, random_projection_matrix)
return embeddings
Here I'll assume generate_embeddings has the signature List[str], ... -> (List[str], List[List[float]])
What you want to do is avoid splitting your texts and ids into separate PCollections. So you might want to write something like
def generate_embeddings_for_batch(
batch,
module_url,
random_projection_matrix) -> Tuple[int, str, List[float]]:
embeddings = generate_embeddings(
[x['text'] for x in batch], module_url, random_projection_matrix)
text_to_embedding = dict(embeddings)
for id, text in batch:
yield x['id'], x['text'], text_to_embedding[x['text']]
From there you should be able to write to_tf_example.
It would probably make sense to look at using TFX.

Cannot replace spaCy lemmatized pronouns (-PRON-) through text

I'm trying to lemmatise a text with spaCy. Since spaCy uses -PRON- as lemma for personal pronouns, I want to keep the original text in all those cases.
Here's the relevant section of my code:
...
fout = open('test.txt', 'w+')
doc = nlp(text)
for word in doc:
if word.lemma_ == "-PRON-":
write = word.text
print(write)
else:
write = word.lemma_
fout.write(str(write))
fout.write(" ")
...
The print statement does print the original words for the cases where spaCy attributes the lemma '-PRON-'.
However, my output file (test.txt) always contains '-PRON-' for those cases, even though I would expect it to write the original words for those cases (I, us etc.)
What am I missing?
I tried different versions, including using the pos_ tag to identify the pronouns etc. but always with the same result, i.e., that my output contains '-PRON-'s
Try this somewhat altered code snipped to see what you get...
import spacy
nlp = spacy.load('en_core_web_sm')
text = 'Did he write the code for her?'
doc = nlp(text)
out_sent = [w.lemma_ if w.lemma_ !='-PRON-' else w.text for w in doc]
out_sent = ' '.join(out_sent)
print(out_sent)
with open('out_sent.txt', 'w') as f:
f.write(out_sent + '\n')
This should produce...
do he write the code for her ?

How to split a PDF every n page using PyPDF2?

I'm trying to learn how to split a pdf every n page.
In my case I want to split a 64p PDF into several chunks containing four pages each: file 1: p.1-4, file 2: p.5-8 etc.
I'm trying to understand PyPDF2 but my noobness overwhelms me:
from PyPDF2 import PdfFileWriter, PdfFileReader
pdf = PdfFileReader('my_pdf.pdf')
I guess I need to make a loop of sorts using addPage and write files till there's no pages left?
Little late but I ran into your question while looking for help trying to do the same thing.
I ended up doing the following, which does what you're asking. Mind you it's probably more than you're asking for, but the answer is in there. It's a rough first draft, in heavy need of refactoring and some variable renaming.
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
def split_pdf(in_pdf, step=1):
"""Splits a given pdf into seperate pdfs and saves
those to a supfolder of the parent pdf's folder, called
splitted_pdf.
Arguments:
in_pdf: [str] Absolute path (and filename) of the
input pdf or just the filename, if the file
is in the current directory.
step: [int] Desired number of pages in each of the
output pdfs.
Returns:
dunno yet
"""
#TODO: Add choice for output dir
#TODO: Add logging instead of prints
#TODO: Refactor
try:
with open(in_pdf, 'rb') as in_file:
input_pdf = PdfFileReader(in_file)
num_pages = input_pdf.numPages
input_dir, filename = os.path.split(in_pdf)
filename = os.path.splitext(filename)[0]
output_dir = input_dir + "/" + filename + "_splitted/"
os.mkdir(output_dir)
intervals = range(0, num_pages, step)
intervals = dict(enumerate(intervals, 1))
naming = f'{filename}_p'
count = 0
for key, val in intervals.items():
output_pdf = PdfFileWriter()
if key == len(intervals):
for i in range(val, num_pages):
output_pdf.addPage(input_pdf.getPage(i))
nums = f'{val + 1}' if step == 1 else f'{val + 1}-{val + step}'
with open(f'{output_dir}{naming}{nums}.pdf', 'wb') as outfile:
output_pdf.write(outfile)
print(f'{naming}{nums}.pdf written to {output_dir}')
count += 1
else:
for i in range(val, intervals[key + 1]):
output_pdf.addPage(input_pdf.getPage(i))
nums = f'{val + 1}' if step == 1 else f'{val + 1}-{val + step}'
with open(f'{output_dir}{naming}{nums}.pdf', 'wb') as outfile:
output_pdf.write(outfile)
print(f'{naming}{nums}.pdf written to {output_dir}')
count += 1
except FileNotFoundError as err:
print('Cannot find the specified file. Check your input:')
print(f'{count} pdf files written to {output_dir}')
Hope it helps you.
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
# Method to split the pdf at every given n pages.
def split_at_every(self,infile , step = 1):
# Copy the input file path to a local variable infile
input_pdf = PdfFileReader(open(infile, "rb"))
pdf_len = input_pdf.number_of_pages
# Get the complete file name along with its path and split the text to take only the first part.
fname = os.path.splitext(os.path.basename(infile))[0]
# Get the list of page numbers in the order of given step
# If there are 10 pages in a pdf, and the step is 2
# page_numbers = [0,2,4,6,8]
page_numbers = list(range(0,pdf_len,step))
# Loop through the pdf pages
for ind,val in enumerate(page_numbers):
# Check if the index is last in the given page numbers
# If the index is not the last one, carry on with the If block.
if(ind+1 != len(page_numbers)):
# Initialize the PDF Writer
output_1 = PdfFileWriter()
# Loop through the pdf pages starting from the value of current index till the value of next index
# Ex : page numbers = [0,2,4,6,8]
# If the current index is 0, loop from 1st page till the 2nd page in the pdf doc.
for page in range(page_numbers[ind], page_numbers[ind+1]):
# Get the data from the given page number
page_data = input_pdf.getPage(page)
# Add the page data to the pdf_writer
output_1.addPage(page_data)
# Frame the output file name
output_1_filename = '{}_page_{}.pdf'.format(fname, page + 1)
# Write the output content to the file and save it.
self.write_to_file(output_1_filename, output_1)
else:
output_final = PdfFileWriter()
output_final_filename = "Last_Pages"
# Loop through the pdf pages starting from the value of current index till the last page of the pdf doc.
# Ex : page numbers = [0,2,4,6,8]
# If the current index is 8, loop from 8th page till the last page in the pdf doc.
for page in range(page_numbers[ind], pdf_len):
# Get the data from the given page number
page_data = input_pdf.getPage(page)
# Add the page data to the pdf_writer
output_final.addPage(page_data)
# Frame the output file name
output_final_filename = '{}_page_{}.pdf'.format(fname, page + 1)
# Write the output content to the file and save it.
self.write_to_file(output_final_filename,output_final)

Exporting a 3D numpy to a VTK file for viewing in Paraview/Mayavi

For those that want to export a simple 3D numpy array (along with axes) to a .vtk (or .vtr) file for post-processing and display in Paraview or Mayavi there's a little module called PyEVTK that does exactly that. The module supports structured and unstructured data etc..
Unfortunately, even though the code works fine in unix-based systems I couldn't make it work (keeps crashing) on any windows installation which simply makes things complicated. Ive contacted the developer but his suggestions did not work
Therefore my question is:
How can one use the from vtk.util import numpy_support function to export a 3D array (the function itself doesn't support 3D arrays) to a .vtk file? Is there a simple way to do it without creating vtkDatasets etc etc?
Thanks a lot!
It's been forever and I had entirely forgotten asking this question but I ended up figuring it out. I've written a post about it in my blog (PyScience) providing a tutorial on how to convert between NumPy and VTK. Do take a look if interested:
pyscience.wordpress.com/2014/09/06/numpy-to-vtk-converting-your-numpy-arrays-to-vtk-arrays-and-files/
It's not a direct answer to your question, but if you have tvtk (if you have mayavi, you should have it), you can use it to write your data to vtk format. (See: http://code.enthought.com/projects/files/ETS3_API/enthought.tvtk.misc.html )
It doesn't use PyEVTK, and it supports a broad range of data sources (more than just structured and unstructured grids), so it will probably work where other things aren't.
As a quick example (Mayavi's mlab interface can make this much less verbose, especially if you're already using it.):
import numpy as np
from enthought.tvtk.api import tvtk, write_data
data = np.random.random((10,10,10))
grid = tvtk.ImageData(spacing=(10, 5, -10), origin=(100, 350, 200),
dimensions=data.shape)
grid.point_data.scalars = np.ravel(order='F')
grid.point_data.scalars.name = 'Test Data'
# Writes legacy ".vtk" format if filename ends with "vtk", otherwise
# this will write data using the newer xml-based format.
write_data(grid, 'test.vtk')
And a portion of the output file:
# vtk DataFile Version 3.0
vtk output
ASCII
DATASET STRUCTURED_POINTS
DIMENSIONS 10 10 10
SPACING 10 5 -10
ORIGIN 100 350 200
POINT_DATA 1000
SCALARS Test%20Data double
LOOKUP_TABLE default
0.598189 0.228948 0.346975 0.948916 0.0109774 0.30281 0.643976 0.17398 0.374673
0.295613 0.664072 0.307974 0.802966 0.836823 0.827732 0.895217 0.104437 0.292796
0.604939 0.96141 0.0837524 0.498616 0.608173 0.446545 0.364019 0.222914 0.514992
...
...
TVTK of Mayavi has a beautiful way of writing vtk files. Here is a test example I have written for myself following #Joe and tvtk documentation. The advantage it has over evtk, is the support for both ascii and html.Hope it will help other people.
from tvtk.api import tvtk, write_data
import numpy as np
#data = np.random.random((3, 3, 3))
#
#i = tvtk.ImageData(spacing=(1, 1, 1), origin=(0, 0, 0))
#i.point_data.scalars = data.ravel()
#i.point_data.scalars.name = 'scalars'
#i.dimensions = data.shape
#
#w = tvtk.XMLImageDataWriter(input=i, file_name='spoints3d.vti')
#w.write()
points = np.array([[0,0,0], [1,0,0], [1,1,0], [0,1,0]], 'f')
(n1, n2) = points.shape
poly_edge = np.array([[0,1,2,3]])
print n1, n2
## Scalar Data
#temperature = np.array([10., 20., 30., 40.])
#pressure = np.random.rand(n1)
#
## Vector Data
#velocity = np.random.rand(n1,n2)
#force = np.random.rand(n1,n2)
#
##Tensor Data with
comp = 5
stress = np.random.rand(n1,comp)
#
#print stress.shape
## The TVTK dataset.
mesh = tvtk.PolyData(points=points, polys=poly_edge)
#
## Data 0 # scalar data
#mesh.point_data.scalars = temperature
#mesh.point_data.scalars.name = 'Temperature'
#
## Data 1 # additional scalar data
#mesh.point_data.add_array(pressure)
#mesh.point_data.get_array(1).name = 'Pressure'
#mesh.update()
#
## Data 2 # Vector data
#mesh.point_data.vectors = velocity
#mesh.point_data.vectors.name = 'Velocity'
#mesh.update()
#
## Data 3 additional vector data
#mesh.point_data.add_array( force)
#mesh.point_data.get_array(3).name = 'Force'
#mesh.update()
mesh.point_data.tensors = stress
mesh.point_data.tensors.name = 'Stress'
# Data 4 additional tensor Data
#mesh.point_data.add_array(stress)
#mesh.point_data.get_array(4).name = 'Stress'
#mesh.update()
write_data(mesh, 'polydata.vtk')
# XML format
# Method 1
#write_data(mesh, 'polydata')
# Method 2
#w = tvtk.XMLPolyDataWriter(input=mesh, file_name='polydata.vtk')
#w.write()
I know it is a bit late and I do love your tutorials #somada141. This should work too.
def numpy2VTK(img, spacing=[1.0, 1.0, 1.0]):
# evolved from code from Stou S.,
# on http://www.siafoo.net/snippet/314
# This function, as the name suggests, converts numpy array to VTK
importer = vtk.vtkImageImport()
img_data = img.astype('uint8')
img_string = img_data.tostring() # type short
dim = img.shape
importer.CopyImportVoidPointer(img_string, len(img_string))
importer.SetDataScalarType(VTK_UNSIGNED_CHAR)
importer.SetNumberOfScalarComponents(1)
extent = importer.GetDataExtent()
importer.SetDataExtent(extent[0], extent[0] + dim[2] - 1,
extent[2], extent[2] + dim[1] - 1,
extent[4], extent[4] + dim[0] - 1)
importer.SetWholeExtent(extent[0], extent[0] + dim[2] - 1,
extent[2], extent[2] + dim[1] - 1,
extent[4], extent[4] + dim[0] - 1)
importer.SetDataSpacing(spacing[0], spacing[1], spacing[2])
importer.SetDataOrigin(0, 0, 0)
return importer
Hope it helps!
Here's a SimpleITK version with the function load_itk taken from here:
import SimpleITK as sitk
import numpy as np
if len(sys.argv)<3:
print('Wrong number of arguments.', file=sys.stderr)
print('Usage: ' + __file__ + ' input_sitk_file' + ' output_sitk_file', file=sys.stderr)
sys.exit(1)
def quick_read(filename):
# Read image information without reading the bulk data.
file_reader = sitk.ImageFileReader()
file_reader.SetFileName(filename)
file_reader.ReadImageInformation()
print('image size: {0}\nimage spacing: {1}'.format(file_reader.GetSize(), file_reader.GetSpacing()))
# Some files have a rich meta-data dictionary (e.g. DICOM)
for key in file_reader.GetMetaDataKeys():
print(key + ': ' + file_reader.GetMetaData(key))
def load_itk(filename):
# Reads the image using SimpleITK
itkimage = sitk.ReadImage(filename)
# Convert the image to a numpy array first and then shuffle the dimensions to get axis in the order z,y,x
data = sitk.GetArrayFromImage(itkimage)
# Read the origin of the ct_scan, will be used to convert the coordinates from world to voxel and vice versa.
origin = np.array(list(reversed(itkimage.GetOrigin())))
# Read the spacing along each dimension
spacing = np.array(list(reversed(itkimage.GetSpacing())))
return data, origin, spacing
def convert(data, output_filename):
image = sitk.GetImageFromArray(data)
writer = sitk.ImageFileWriter()
writer.SetFileName(output_filename)
writer.Execute(image)
def wait():
print('Press Enter to load & convert or exit using Ctrl+C')
input()
quick_read(sys.argv[1])
print('-'*20)
wait()
data, origin, spacing = load_itk(sys.argv[1])
convert(sys.argv[2])