I have written code that scrapes data from a site, converts that data into a pandas DataFrame, cleans it up, and then sends it to AWS S3, where the data is stored.
import requests
import pandas as pd
from datetime import datetime
from datetime import date
import json
import smart_open
def save_to_s3(s3_file_location, data_item):
    """Write *data_item* to S3 as newline-delimited JSON.

    Parameters
    ----------
    s3_file_location : str
        Full S3 URI (e.g. ``s3://bucket/key``) understood by smart_open.
    data_item : pandas.DataFrame or iterable
        A DataFrame is serialised row by row; any other iterable is
        serialised element by element, one JSON document per line.
    """
    # BUG FIX: iterating a DataFrame directly yields its column labels,
    # not its rows, so the original wrote only the header names to S3.
    if isinstance(data_item, pd.DataFrame):
        records = data_item.to_dict(orient='records')
    else:
        records = data_item
    with smart_open.open(s3_file_location, "w") as out_file:
        for record in records:
            out_file.write(json.dumps(record))
            out_file.write("\n")
    print(f"Data saved to {s3_file_location}")
def pulling_corporate_announcements(event=None, context=None):
    """AWS Lambda handler: fetch today's NSE corporate announcements,
    clean them with pandas, and persist the result to S3.

    Parameters
    ----------
    event, context : optional
        Standard Lambda invocation arguments; both are unused.
    """
    print("Started Pulling")
    currentd = date.today()
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
    s = requests.Session()
    # Visit the homepage first so the session picks up the cookies NSE
    # requires before it will serve the JSON API.
    # BUG FIX: the original issued every request twice — once with and once
    # without the UA header — keeping only the un-headered response, which
    # NSE is liable to block.
    s.get('https://www.nseindia.com/', headers=headers)
    today = datetime.now().strftime('%d-%m-%Y')
    api_url = (
        'https://www.nseindia.com/api/corporate-announcements'
        f'?index=equities&from_date={today}&to_date={today}'
    )
    resp = s.get(api_url, headers=headers).json()
    result = pd.DataFrame(resp)
    # Drop bookkeeping columns the feed returns but the report does not need.
    result.drop(['difference', 'dt', 'exchdisstime', 'csvName', 'old_new',
                 'orgid', 'seq_id', 'sm_isin', 'bflag', 'symbol', 'sort_date'],
                axis=1, inplace=True)
    result.rename(columns={'an_dt': 'DateandTime', 'attchmntFile': 'Source',
                           'attchmntText': 'Topic', 'desc': 'Type',
                           'smIndustry': 'Sector', 'sm_name': 'Company Name'},
                  inplace=True)
    result[['Date', 'Time']] = result.DateandTime.str.split(expand=True)
    # Filter out announcement types that are noise for this report.
    result['Type'] = result['Type'].astype(str)
    result = result[~result['Type'].str.contains(
        "Loss of Share Certificates|Copy of Newspaper Publication")]
    result['Type'] = result['Type'].replace(
        "Certificate under SEBI (Depositories and Participants) Regulations, 2018",
        'Junk')
    result = result[~result['Type'].str.contains("Junk")]
    result = result[~result["Type"].str.contains("Trading Window")]
    result.drop_duplicates(subset='Source', keep='first', inplace=True)
    # Re-shape the timestamp into the display formats the report uses.
    result['Temporary'] = pd.to_datetime(result['Date'] + ' ' + result['Time'])
    result['Date'] = result['Temporary'].dt.strftime('%b %d, %Y')
    result['Time'] = result['Temporary'].dt.strftime('%R %p')
    # BUG FIX: the original assigned DateTime twice; the intermediate
    # pd.to_datetime result was immediately overwritten, so it is dropped.
    result['DateTime'] = result['Temporary'].dt.strftime('%m/%d/%Y %I:%M %p')
    result.drop(['DateandTime', 'Temporary'], axis=1, inplace=True)
    # NOTE(review): the file is named .csv but save_to_s3 writes JSON lines —
    # confirm which format downstream consumers actually expect.
    file_name = (str(currentd.day) + '-' + str(currentd.month) + '-' + 'CA.csv')
    s3_location = "s3://corpanc/" + file_name
    save_to_s3(s3_location, result)
    print('Saved the CSV File')
This code works perfectly on my local Windows 10 machine, but when I uploaded it to AWS Lambda it gives me the error below. I have tried every way I know to install numpy, build a ZIP file, and upload it, but it still does not work. I have also tried adding a numpy layer, yet the error persists. Error message -
[ERROR] Runtime.ImportModuleError: Unable to import module 'coranc': Unable to import required dependencies:
numpy:
IMPORTANT: PLEASE READ THIS FOR ADVICE ON HOW TO SOLVE THIS ISSUE!
Importing the numpy C-extensions failed. This error can happen for
many reasons, often due to issues with your setup or how NumPy was
installed.
We have compiled some common reasons and troubleshooting tips at:
https://numpy.org/devdocs/user/troubleshooting-importerror.html
Please note and check the following:
* The Python version is: Python3.9 from "/var/lang/bin/python3.9"
* The NumPy version is: "1.22.3"
and make sure that they are the versions you expect.
Please carefully study the documentation linked above for further help.
Original error was: No module named 'numpy.core._multiarray_umath
'
Newspaper3k is a good Python library for news content extraction, and it works well for the most part.
I want to extract names after the first "by" word in the visible text. This is my code; it did not work well — could somebody please help?
import re
from newspaper import Config
from newspaper import Article
from newspaper import ArticleException
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
# Fetch one article and print the first byline found in its visible text.
USER_AGENT = 'Mozilla/5.0 (Macintosh;Intel Mac OS X 10.15; rv:78.0)Gecko/20100101 Firefox/78.0'
config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10
html1 = 'https://saugeentimes.com/new-perspectives-a-senior-moment-food-glorious-food-part-2/'
article = Article(html1.strip(), config=config)
article.download()
article.parse()
# BUG FIX: BeautifulSoup needs the raw HTML string, not the Article object.
soup = BeautifulSoup(article.html, 'html.parser')
# Keep only the visible text: strip styling, scripts and document metadata.
for hidden in soup(['style', 'script', '[document]', 'head', 'title']):
    hidden.extract()
visible_text = soup.get_text()
# BUG FIX: the original looped over the string character by character, so
# the regex could never see a whole "By <name>" token.  Scan line by line
# and stop at the first byline instead.
for line in visible_text.splitlines():
    # Capture one-or-more non-space characters after the first "By".
    match = re.search(r'By (\S+)', line)
    if match:
        print('By {}'.format(match.group(1)))
        break
This is not a comprehensive answer, but it is one that you can build from. You will need to expand this code as you add additional sources. As I stated before, my Newspaper3k overview document has lots of extraction examples, so please review it thoroughly.
Regular expressions should be a last ditch effort after trying these extraction methods with newspaper3k:
article.authors
meta tags
json
soup
from newspaper import Config
from newspaper import Article
from newspaper.utils import BeautifulSoup
# Newspaper fetch configuration: spoof a desktop-browser user agent and
# allow a generous timeout so slow news sites do not abort the download.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

# Sample articles drawn from sources with differing byline markup.
urls = [
    'https://saugeentimes.com/new-perspectives-a-senior-moment-food-glorious-food-part-2',
    'https://www.macleans.ca/education/what-college-students-in-canada-can-expect-during-covid',
    'https://www.cnn.com/2021/02/12/asia/india-glacier-raini-village-chipko-intl-hnk/index.html',
    'https://www.latimes.com/california/story/2021-02-13/wildfire-santa-cruz-boulder-creek-residents-fear-water'
    '-quality',
    'https://foxbaltimore.com/news/local/maryland-lawmakers-move-ahead-with-first-tax-on-internet-ads-02-13-2021',
]
# For each article, prefer newspaper's own author extraction and fall back
# to scraping known byline containers from the raw HTML.
for url in urls:
    try:
        article = Article(url, config=config)
        article.download()
        article.parse()
        authors = article.authors
        if authors:
            # newspaper located the byline itself.
            print(authors)
        else:
            soup = BeautifulSoup(article.html, 'html.parser')
            container = soup.find(True, {'class': ['td-post-author-name', 'byline']})
            # BUG FIX: the original chained .find().find(), so a page with
            # no byline container raised AttributeError and was silently
            # skipped instead of printing 'no author found'.
            author_tag = container.find(['a', 'span']) if container else None
            if author_tag:
                print(author_tag.get_text().replace('By', '').strip())
            else:
                print('no author found')
    except AttributeError:
        # Best-effort: ignore pages whose markup still defeats the fallback.
        pass
I'm trying to call my TensorFlow model, which is deployed on a Cloud Foundry server, from a Python 2.7 API using TensorFlow Serving and gRPC. The model expects a 200-dimensional vector as input, which I have hard-coded for the moment. The connection variables are stored in a virtualenv and have been double-checked.
The code:
import os
from grpc.beta import implementations
import tensorflow as tf
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2
from grpc._cython import cygrpc
# Connection settings, injected via environment variables (kept in the
# virtualenv per the deployment notes).
MODEL_NAME = str(os.getenv('MODEL_NAME', ''))
MODEL_SERVER_HOST = str(os.getenv('MODEL_SERVER_HOST', ''))
# NOTE(review): int('') raises ValueError, so this line crashes at import
# time when MODEL_SERVER_PORT is unset — confirm it is always provided.
MODEL_SERVER_PORT = int(os.getenv('MODEL_SERVER_PORT', ''))
# The PEM certificate is stored with escaped newlines; restore them here.
ROOT_CERT = str(os.getenv('ROOT_CERT', '')).replace('\\n', '\n')
def metadata_transformer(metadata):
    """Return *metadata* extended with the bearer-token authorization pair.

    gRPC hands in the existing call metadata; the same pairs are returned
    as a tuple with the hard-coded OAuth token appended at the end.
    """
    auth_header = ('authorization', 'Bearer <my access token>')
    return tuple(metadata) + (auth_header,)
# Open a TLS-secured gRPC channel to the TensorFlow Serving host and attach
# the authorization metadata to every call made through the stub.
credentials = implementations.ssl_channel_credentials(root_certificates=ROOT_CERT)
channel = implementations.secure_channel(MODEL_SERVER_HOST, MODEL_SERVER_PORT, credentials)
stub = prediction_service_pb2.beta_create_PredictionService_stub(channel, metadata_transformer=metadata_transformer)
import numpy as np
# Hard-coded 200-dimensional example input vector for the model.
data = np.matrix([0.06222425773739815, 0.08211926370859146, -0.060986146330833435, 0.13920938968658447, 0.10515272617340088, -0.06220443174242973, -0.05927170068025589, -0.054189786314964294, -0.0986655130982399, 0.013334010727703571, -0.05667420104146004, 0.059366412460803986, -0.03483295068144798, -0.05382293462753296, 0.02721281163394451, -0.1428503543138504, 0.029297124594449997, 0.07006879895925522, 0.06501731276512146, 0.028620243072509766, 0.07128454744815826, 0.029960375279188156, 0.0710490494966507, -0.04619687795639038, -0.03106304071843624, -0.04266272485256195, 0.004348727408796549, 0.03099834732711315, 0.09248803555965424, -0.036939311772584915, 0.00017547572497278452, 0.03521900251507759, 0.10932505130767822, -0.019729139283299446, 0.12315405160188675, 0.10092845559120178, -0.12633951008319855, -0.022320391610264778, 0.0870826318860054, -0.06696301698684692, -0.016253307461738586, -0.0413096621632576, -0.040929097682237625, 0.09338817000389099, -0.08800378441810608, 0.015543102286756039, 0.018787918612360954, 0.07351260632276535, 0.038140904158353806, 0.019255049526691437, 0.0875692293047905, -0.07542476058006287, -0.04116508364677429, 0.04507743567228317, -0.06986603885889053, -0.24688798189163208, -0.035459864884614944, 0.06200174242258072, -0.06932217627763748, 0.06320516765117645, -0.023999478667974472, -0.04712359234690666, 0.03672196343541145, -0.02999514900147915, 0.04105519875884056, 0.08891177922487259, 0.15175248682498932, -0.0021488466300070286, 0.04398706927895546, -0.04429445043206215, 0.04708605632185936, 0.043234940618276596, -0.043555982410907745, 0.017381751909852028, 0.048889972269535065, -0.016929129138588905, 0.01731136068701744, -0.04694319888949394, 0.20381565392017365, 0.009074307978153229, 0.004490611143410206, -0.08525945991277695, -0.03385556861758232, 0.017475442960858345, -0.040392760187387466, 0.14970248937606812, 0.042721331119537354, -0.1257765144109726, -0.07097769528627396, -0.10943038016557693, 0.015442096628248692, 
-0.06519876420497894, -0.07588690519332886, -0.07620779424905777, 0.04572996124625206, -0.058589719235897064, -0.04492143541574478, -0.01922304928302765, -0.008066931739449501, 0.04317406192421913, 0.020763304084539413, -0.025430725887417793, 0.04271349683403969, 0.07393930852413177, 0.0020402593072503805, 0.0783640518784523, 0.047386448830366135, 0.010610940866172314, 0.022059153765439987, 0.034980181604623795, -0.006882485933601856, -0.08911270648241043, -0.001243607490323484, -0.06307544559240341, -0.01352659147232771, -0.24622271955013275, 0.07930449396371841, 0.03659113869071007, -0.05077377334237099, 0.08726480603218079, -0.09274136275053024, -0.05766649544239044, -0.12269984930753708, 0.056026071310043335, -0.0048304214142262936, -0.05568183213472366, -0.08890420943498611, -0.02911136858165264, -0.0944124087691307, 0.0011820291401818395, -0.08908636122941971, -0.008728212676942348, -0.014545259065926075, -0.008866528049111366, 0.02728298306465149, -0.020994992926716805, 0.031155599281191826, 0.036098793148994446, 0.06911332905292511, -0.06691643595695496, -0.00014896543871145695, -0.007080242037773132, 0.0031992685981094837, 0.043563224375247955, 0.02550852671265602, -0.015397937037050724, 0.06041031703352928, -0.08981014788150787, -0.10881254076957703, 0.03226703032851219, -0.02039985917508602, -0.05354547128081322, -0.026514282450079918, 0.09616094827651978, -0.04160488396883011, -0.06793050467967987, -0.17060619592666626, -0.08044841140508652, 0.042605575174093246, 0.08186516910791397, 0.026051705703139305, 0.1254323273897171, 0.09807661175727844, 0.04692094400525093, 0.05536479875445366, 0.004592049401253462, 0.01953544095158577, -0.02827763929963112, 0.11051501333713531, -0.05077047273516655, -0.09987067431211472, 0.025186538696289062, -0.24119670689105988, -0.054666098207235336, 0.03561021387577057, -0.006030901800841093, 0.14740994572639465, 0.09515859931707382, 0.0628485381603241, 0.020558597519993782, -0.04458167776465416, -0.04740617796778679, 
0.024550801143050194, -0.09533495455980301, 0.057229768484830856, -0.08855120837688446, 0.027864644303917885, -0.07248448580503464, 0.0647491067647934, 0.09660986065864563, 0.038834456354379654, -0.030274877324700356, -0.024261653423309326, 0.05457066744565964, -0.00860705878585577, 0.04901411384344101, 0.017157232388854027, -0.02722001262009144, 0.012187148444354534, 0.05596058815717697])
# Build the PredictRequest: model identity, the signature to invoke, and
# the input tensor packed from the vector above.
request = predict_pb2.PredictRequest()
request.model_spec.name = MODEL_NAME
request.model_spec.signature_name = 'ticketCatFeature2'
request.inputs['input'].CopyFrom(
tf.contrib.util.make_tensor_proto(data, shape=[200]))
# Python 2 print statement: issue the RPC with a 10-second deadline.
print stub.Classify(request, 10)
I'm getting the following error message when running the app:
Traceback (most recent call last):
File "app.py", line 36, in
print stub.Classify(request, 10)
File "/home/vagrant/Desktop/Masterarbeit/appDir/venv/local/lib/python2.7/site-packages/grpc/beta/_client_adaptations.py", line 309, in call
self._request_serializer, self._response_deserializer)
File "/home/vagrant/Desktop/Masterarbeit/appDir/venv/local/lib/python2.7/site-packages/grpc/beta/_client_adaptations.py", line 195, in _blocking_unary_unary
raise _abortion_error(rpc_error_call)
grpc.framework.interfaces.face.face.AbortionError: AbortionError(code=StatusCode.INTERNAL, details="Did not read entire message")
Log of grpc Debug: https://ufile.io/owk76
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from items import BackpageItem, CityvibeItem
from scrapy.shell import inspect_response
import re
import time
import sys
class MySpider(CrawlSpider):
    """Crawl example.com listing pages and extract one item per posting."""
    name = 'example'
    allowed_domains = ['www.example.com']
    # Set last_page to decide how many listing pages are crawled.
    last_page = 10
    start_urls = ['http://www.example.com/washington/?page=%s' % page
                  for page in xrange(1, last_page)]
    # Follow every link named "listing_link" and feed each to parse_item.
    # BUG FIX: XPath attribute tests use '@', not '#' — the '#' characters
    # (a formatting artifact) made every expression below invalid.
    rules = (
        Rule(LinkExtractor(restrict_xpaths=('//a[@name="listing_link"]')),
             callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract the relevant fields of one posting into an item."""
        # NOTE(review): ExampleItem is not among the imported item classes
        # (BackpageItem, CityvibeItem) — confirm which item class is meant.
        item = ExampleItem()
        item['title'] = response.xpath('string(//h2[@class="post-title"]/text())').extract()
        item['desc'] = response.xpath('string(//div[@class="section post-body"]/text())').extract()
        item['url'] = response.url
        item['location'] = response.xpath('string(//div[@class="posting"]/div[2]/text())').extract()
        item['posted_date'] = response.xpath('string(//div[@class="post-date"]/span/text())').extract()
        item['crawled_date'] = time.strftime("%c")
        # Not sure how to get the other image URLs right now.
        item['image_urls'] = response.xpath('string(//div[@class="section post-contact-container"]/div/div/img/@src)').extract()
        # This section is not present on every page.
        item['other_ad_urls'] = response.xpath('//a[@name="listing_link"]/@href').extract()
        item['phone_number'] = "".join(response.xpath('//div[@class="post-info"]/span[contains(text(), "Phone")]/following-sibling::a/text()').extract())
        item['email'] = "".join(response.xpath('//div[@class="post-info"]/span[contains(text(), "Email")]/following-sibling::a/text()').extract())
        item['website'] = "".join(response.xpath('//div[@class="post-info limit"]/span[contains(text(), "Website")]/following-sibling::a/text()').extract())
        item['name'] = response.xpath('//div[@class="post-name"]/text()').extract()
        # Uncomment for debugging:
        # inspect_response(response, self)
        return item
# Earlier configuration with the item pipelines enabled, kept for reference:
# process1 = CrawlerProcess({
#     'ITEM_PIPELINES': {
#         # 'scrapy.contrib.pipeline.images.ImagesPipeline': 1
#         'backpage.pipelines.GeolocationPipeline': 4,
#         'backpage.pipelines.LocationExtractionPipeline': 3,
#         'backpage.pipelines.BackpagePipeline': 5,
#     }
# })

# Run the spider in-process with a plain user-agent override.
settings = {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
process1 = CrawlerProcess(settings)
process1.crawl(MySpider)
process1.start()
My spider works perfectly when I run it from the command line with
scrapy crawl example
but I will need to run multiple spiders, so I want to put them all in a script and use CrawlerProcess. When I try to run this I get the error,
AttributeError: 'CrawlerProcess' object has no attribute 'crawl'
This is scrapy version 0.24.6.
All items and pipelines are correct, because the spider works from the command line.
There is (was?) a compatibility problem between Scrapy and Scrapyd. I needed to run Scrapy 0.24 and Scrapyd 1.0.1.
Here is the issue on Github
https://github.com/scrapy/scrapyd/issues/100#issuecomment-115268880