I'm processing a file and I'd like to remove (trim) the first X header lines to keep only data, possibly avoiding using regular expressions.
Thanks
You can remove the first X header lines by using ExecuteScript procesor in Nifi.
The following is a example Jython script which I wrote for myself:
import json
import java.io
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import StreamCallback
class PyStreamCallback(StreamCallback):
def __init__(self):
pass
def process(self, inputStream, outputStream):
text = IOUtils.readLines(inputStream, StandardCharsets.UTF_8)
for line in text[3:]:
outputStream.write(line + "\n")
flowFile = session.get()
if (flowFile != None):
flowFile = session.write(flowFile,PyStreamCallback())
flowFile = session.putAttribute(flowFile, "filename", flowFile.getAttribute('filename').split('.')[0]+'_translated.json')
session.transfer(flowFile, REL_SUCCESS)
This obviously removes the first 3 lines but you can easily modify it to remove more or less lines.
Hope that helps.
Related
Trying to use urllib.request to read a list of urls from a shapefile, then download the zips from all those URLs. So far I got my list of a certain number of URLs, but I am unable to pass all of them through. The error is expected string or bytes-like object. Meaning theres prob an issue with the URL. As a side note, I also need to download them and name them by their file name/#. Need help!! Code below.
import arcpy
import urllib.request
import os
os.chdir('C:\\ProgInGIS\\FinalExam\\Final')
lidar_shp = 'C:\\ProgInGIS\\FinalExam\\Final\\lidar-2013.shp'
zip_file_download = 'C:\\ProgInGIS\\FinalExam\\Final\\file1.zip'
data = []
with arcpy.da.SearchCursor(lidar_shp,"*") as cursor:
for row in cursor:
data.append(row)
data.sort(key=lambda tup: tup[2])
i = 0
with arcpy.da.UpdateCursor(lidar_shp,"*") as cursor:
for row in cursor:
row = data[i]
i += 1
cursor.updateRow(row)
counter = 0
url_list = []
with arcpy.da.UpdateCursor(lidar_shp, ['geotiff_ur']) as cursor:
for row in cursor:
url_list.append(row)
counter += 1
if counter == 18:
break
for item in url_list:
print(item)
urllib.request.urlretrieve(item)
I understand your question this way: you want to download a zip file for each record in a shapefile from an URL defined in a certain field.
It's easier to use the requests package which is also recommended in the urllib.request documentation:
The Requests package is recommended for a higher-level HTTP client interface.
Here is an example:
import arcpy, arcpy.da
import shutil
import requests
SHAPEFILE = "your_shapefile.shp"
with arcpy.da.SearchCursor(SHAPEFILE, ["name", "url"]) as cursor:
for name, url in cursor:
response = requests.get(url, stream=True)
if response.status_code == 200:
with open(f"{name}.zip", "wb") as file:
response.raw.decode_content = True
shutil.copyfileobj(response.raw, file)
There is another example on GIS StackExchange:
https://gis.stackexchange.com/a/392463/21355
So, this should have been simpler, and in the end it was quite simple, but the Scrapy documentations do leave some finding out to do ... So anyway this is a Q + A:
How to right Scrapy items "as is" line by line to a text file?
Basically you need to register an item exporter, and then tell Scrapy that you want to use it from command line:
Create a file named: lines_exporter.py:
from scrapy.exporters import BaseItemExporter
from scrapy.utils.python import to_bytes
class LinesExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
super().__init__(dont_fail=True, **kwargs)
self.file = file
self._kwargs.setdefault('ensure_ascii', not self.encoding)
def export_item(self, item):
itemdict = dict(self._get_serialized_fields(item))
data = ', '.join(itemdict.values()) + '\n'
self.file.write(to_bytes(data, self.encoding))
Add/edit the following in your settings.py:
FEED_EXPORTERS = {
'lines': 'project_name.lines_exporter.LinesExporter',
}
When invoking Scrapy from command line, specify the output-format flag, or:
-t lines
Enjoy!
from rest_framework import status, response
from rest_framework.test import APITestCase
from lots.models import Lot
class LotsTestCase(APITestCase):
def setUp(self) -> None:
self.lot = Lot.objects.create(name="1",
address="Dont Know",
phone_num="010-4451-2211",
latitude=127.12,
longitude=352.123,
basic_rate=20000,
additional_rate=2000,
partnership=False,
section_count=3,)
def test_delete(self):
response = self.client.delete(f'api/lots/{self.lot["name"]}')
# response = self.client.delete(f'/api/users/{self.users[0].pk}')
# url = reverse(f'/api/lots/{self.lot}', kwargs={'pk': self.lot.pk})
# self.client.delete(url)
self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT)
self.assertEqual(self.lot.objects.filter(pk=self.lot.pk.count()))
I have problems with the test code above. Why doesn't it work? I know it has to do with calling dictionary values but I just can't figure it out. Thanks for your help.
Lot.objects.create(...) returns a Lot type so you access name by self.lot.name.
Problem: I cannot replace <br> tags with a newline character using Beautiful Soup 4.
Code: My program (the relevant portion of it) currently looks like
for br in board.select('br'):
br.replace_with('\n')
but I have also tried board.find_all() in place of board.select().
Results: When I use board.replace_with('\n') all <br> tags are replaced with the string literal \n. For example, <p>Hello<br>world</p> would end up becoming Hello\nworld. Using board.replace_with(\n) causes the error
File "<ipython-input-27-cdfade950fdf>", line 10
br.replace_with(\n)
^
SyntaxError: unexpected character after line continuation character
Other Information: I am using a Jupyter Notebook, if that is of any relevance. Here is my full program, as there may be some issue elsewhere that I have overlooked.
import requests
from bs4 import BeautifulSoup
import pandas as pd
page = requests.get("https://boards.4chan.org/g/")
soup = BeautifulSoup(page.content, 'html.parser')
board = soup.find('div', class_='board')
for br in board.select('br'):
br.replace_with('\n')
message = [obj.get_text() for obj in board.select('.opContainer .postMessage')]
image = [obj['href'] for obj in board.select('.opContainer .fileThumb')]
pid = [obj.get_text() for obj in board.select('.opContainer .postInfo .postNum a[title="Reply to this post"]')]
time = [obj.get_text() for obj in board.select('.opContainer .postInfo .dateTime')]
for x in range(len(image)):
image[x] = "https:" + image[x]
post = pd.DataFrame({
"ID": pid,
"Time": time,
"Image": image,
"Message": message,
})
post
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)
display(post)
Any advice would be appreciated. Thank you for reading.
Just tried and it works for me, my bs4 version is 4.8.0, and I am using Python 3.5.3,
example:
from bs4 import BeautifulSoup
soup = BeautifulSoup('hello<br>world')
for br in soup('br'):
br.replace_with('\n')
# <br> was replaced with \n successfully
assert str(soup) == '<html><body><p>hello\nworld</p></body></html>'
# get_text() also works as expected
assert soup.get_text() == 'hello\nworld'
# it is a \n not a \\n
assert soup.get_text() != 'hello\\nworld'
I am not used to work with Jupyter Notebook, but seems that your problem is that whatever you are using to visualize the data is showing you the string representation instead of actually printing the string,
hope this helps,
Regards,
adb
Instead of replacing after converting to soup, try replacing the <br> tags before converting. Like,
soup = BeautifulSoup(str(page.content).replace('<br>', '\n'), 'html.parser')
Hope this helps! Cheers!
P.S.: I did not get any logical reason why this is not working after changing into soup.
After experimenting with variations of
page = requests.get("https://boards.4chan.org/g/")
str_page = page.content.decode()
str_split = '\n<'.join(str_page.split('<'))
str_split = '>\n'.join(str_split.split('>'))
str_split = str_split.replace('\n', '')
str_split = str_split.replace('<br>', ' ')
soup = BeautifulSoup(str_split.encode(), 'html.parser')
for the better part of two hours, I have determined that the Panda data-frame prints the newline character as a string literal. Everything else indicates that the program is working as intended, so I assume this has been the problem all along.
for some reason direct replace with newline does not work with bs4 you have to first replace with some other unique character (character sequence preferably) and then replace that sequence in text with newline.
try this.
for br in soup.find_all('br'): br.replace_with('+++')
text=soup.get_text().replace('+++','\n)
I tried to program an app in tkinter that would load random lines from a file you select from a pull down menu and display the selected line in a text window.
It seems like the variable "var" in insert_text does not return the selected "option" but rather an "empty" string resulting in a the following error:
"File not found error" (FileNotFoundError: [Errno2] No such file or
directory: '').
Please help!
#!/usr/bin/env python
# Python 3
import tkinter
from tkinter import ttk
import random
class Application:
def __init__(self, root):
self.root = root
self.root.title('Random Stuff')
ttk.Frame(self.root, width=450, height=185).pack()
self.init_widgets()
var = tkinter.StringVar(root)
script = var.get()
choices = ['option1', 'option2', 'option3']
option = tkinter.OptionMenu(root, var, *choices)
option.pack(side='right', padx=10, pady=10)
def init_widgets(self):
ttk.Button(self.root, command=self.insert_txt, text='Button', width='10').place(x=10, y=10)
self.txt = tkinter.Text(self.root, width='45', height='5')
self.txt.place(x=10, y=50)
def insert_txt(self):
var = tkinter.StringVar(root)
name = var.get()
line = random.choice(open(str(name)).readlines())
self.txt.insert(tkinter.INSERT, line)
if __name__ == '__main__':
root = tkinter.Tk()
Application(root)
root.mainloop()
That's because you're just creating an empty StringVar that isn't modified later, thus returning an empty string.
The OptionMenu takes the command parameter that calls the specified method every time another option is selected. Now, you can call a method like this, replacing you insert_txt:
def __init__(self):
# ...
self.var = tkinter.StringVar()
self.options = tkinter.OptionMenu(root, var, *choices, command=self.option_selected)
# ...
def option_selected(self, event):
name = self.var.get()
# The stuff you already had
Additionally, you have to empty the Text widget, otherwise the previous text would stay. I think the Entry widget is better for that, too.