Python facebook chatbot download from google - urllib

I'm trying to program a Python file that downloads Google images, but it gives the following error:
"C:\Users\marco\Desktop\Scripts Python\venv\Scripts\python.exe" "C:/Users/marco/Desktop/Scripts Python/ChatBot.py"
Traceback (most recent call last):
File "C:/Users/marco/Desktop/Scripts Python/ChatBot.py", line 4, in
from urllib import FancyURLopener
ImportError: cannot import name 'FancyURLopener' from 'urllib' (C:\Users\marco\AppData\Local\Programs\Python\Python37-32\lib\urllib__init__.py)
my code:
import os
import sys
import time
from urllib import FancyURLopener
import urllib2
import simplejson

# Define search term
searchTerm = "william shatner"

# Replace spaces ' ' in search term for '%20' in order to comply with request
searchTerm = searchTerm.replace(' ', '%20')

# Start FancyURLopener with defined version
class MyOpener(FancyURLopener):
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

myopener = MyOpener()

# Set count to 0
count = 0

for i in range(0, 10):
    # Notice that the start changes for each iteration in order to request a new set of images for each loop
    url = ('https://ajax.googleapis.com/ajax/services/search/images?' + 'v=1.0&q=' + searchTerm + '&start=' + str(i*4) + '&userip=MyIP')
    print(url)

    request = urllib2.Request(url, None, {'Referer': 'testing'})
    response = urllib2.urlopen(request)

    # Get results using JSON
    results = simplejson.load(response)
    data = results['responseData']
    dataInfo = data['results']

    # Iterate for each result and get unescaped url
    for myUrl in dataInfo:
        count = count + 1
        print(myUrl['unescapedUrl'])
        myopener.retrieve(myUrl['unescapedUrl'], str(count) + '.jpg')

    # Sleep for one second to prevent IP blocking from Google
    time.sleep(1)

As the error message says, FancyURLopener is not where you are looking for it. This is the correct import statement:
from urllib.request import FancyURLopener
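For reference, here is a minimal Python 3 sketch of the same fetch-and-save steps using only the standard library (urllib.request and json replace urllib2 and simplejson; FancyURLopener still imports from urllib.request but is deprecated in Python 3). The search endpoint and image URL below are placeholders, since the Google AJAX image-search API used in the original script has long been shut down:

import json
import urllib.request

# Build the request with a browser-style User-Agent header instead of
# subclassing FancyURLopener; the endpoint here is only a placeholder.
searchTerm = 'william shatner'.replace(' ', '%20')
url = 'https://example.com/image-search?q=' + searchTerm   # hypothetical endpoint
request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})

with urllib.request.urlopen(request) as response:
    results = json.load(response)   # json plays the role of simplejson

# urlretrieve plays the role of FancyURLopener.retrieve for saving a file
image_url = 'https://example.com/image.jpg'                # hypothetical image URL
urllib.request.urlretrieve(image_url, '1.jpg')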

Related

Code Working Perfectly in Local Machine although giving error in AWS Lambda

I have written code that scrapes data from a site, converts it into a pandas DataFrame, cleans it up, and then sends it to AWS S3, where the data is stored.
import requests
import pandas as pd
from datetime import datetime
from datetime import date
import json
import smart_open


def save_to_s3(s3_file_location, data_item):
    with smart_open.open(s3_file_location, "w") as out_file:
        for data in data_item:
            out_file.write(json.dumps(data))
            out_file.write("\n")
    print(f"Data saved to {s3_file_location}")


def pulling_corporate_announcements(event=None, context=None):
    print("Started Pulling")
    currentd = date.today()
    s = requests.Session()
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
    url = 'https://www.nseindia.com/'
    step = s.get(url, headers=headers)
    step = s.get(url)
    today = datetime.now().strftime('%d-%m-%Y')
    api_url = f'https://www.nseindia.com/api/corporate-announcements?index=equities&from_date={today}&to_date={today}'
    resp = s.get(api_url, headers=headers).json()
    resp = s.get(api_url).json()
    result = pd.DataFrame(resp)
    result.drop(['difference', 'dt', 'exchdisstime', 'csvName', 'old_new', 'orgid', 'seq_id', 'sm_isin', 'bflag', 'symbol', 'sort_date'], axis=1, inplace=True)
    result.rename(columns={'an_dt': 'DateandTime', 'attchmntFile': 'Source', 'attchmntText': 'Topic', 'desc': 'Type', 'smIndustry': 'Sector', 'sm_name': 'Company Name'}, inplace=True)
    result[['Date', 'Time']] = result.DateandTime.str.split(expand=True)
    result = result[result['Type'].str.contains("Loss of Share Certificates|Copy of Newspaper Publication") == False]
    result['Type'] = result['Type'].astype(str)
    result['Type'].replace("Certificate under SEBI (Depositories and Participants) Regulations, 2018", 'Junk', inplace=True)
    result = result[result['Type'].str.contains("Junk") == False]
    result = result[result["Type"].str.contains("Trading Window") == False]
    result.drop_duplicates(subset='Source', keep='first', inplace=True)
    result['Temporary'] = pd.to_datetime(result['Date'] + ' ' + result['Time'])
    result['Date'] = result['Temporary'].dt.strftime('%b %d, %Y')
    result['Time'] = result['Temporary'].dt.strftime('%R %p')
    result['DateTime'] = pd.to_datetime(result['Temporary'])
    result['DateTime'] = result['Temporary'].dt.strftime('%m/%d/%Y %I:%M %p')
    result.drop(['DateandTime', 'Temporary'], axis=1, inplace=True)
    file_name = (str(currentd.day) + '-' + str(currentd.month) + '-' + 'CA.csv')
    s3_location = "s3://corpanc/" + file_name
    save_to_s3(s3_location, result)
    print('Saved the CSV File')
This code works perfectly on my local Windows 10 machine, but when I uploaded it to AWS Lambda it gives me this error. I have tried every way I know of to install numpy, build a ZIP file, and upload it, but it still does not work. I have also tried adding a numpy layer, and the error persists. Error message:
[ERROR] Runtime.ImportModuleError: Unable to import module 'coranc': Unable to import required dependencies:
numpy:
IMPORTANT: PLEASE READ THIS FOR ADVICE ON HOW TO SOLVE THIS ISSUE!
Importing the numpy C-extensions failed. This error can happen for
many reasons, often due to issues with your setup or how NumPy was
installed.
We have compiled some common reasons and troubleshooting tips at:
https://numpy.org/devdocs/user/troubleshooting-importerror.html
Please note and check the following:
* The Python version is: Python3.9 from "/var/lang/bin/python3.9"
* The NumPy version is: "1.22.3"
and make sure that they are the versions you expect.
Please carefully study the documentation linked above for further help.
Original error was: No module named 'numpy.core._multiarray_umath'
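One common cause worth checking (an assumption, since the packaging steps are not shown): numpy installed with pip on Windows ships Windows C-extensions, which cannot load on Lambda's Amazon Linux runtime, so the ZIP needs Linux (manylinux) wheels or a prebuilt layer. Below is a minimal diagnostic sketch, with a hypothetical handler name, that defers the import so the failure and the runtime platform are logged from inside the handler:

import platform
import sys


# Hypothetical diagnostic handler: deferring the numpy import means this module
# loads even when numpy is broken, so the platform details and the exact import
# error end up in the CloudWatch logs instead of a bare Runtime.ImportModuleError.
def lambda_handler(event=None, context=None):
    print("Python:", sys.version)
    print("Platform:", platform.platform(), platform.machine())
    try:
        import numpy
        print("numpy", numpy.__version__, "loaded from", numpy.__file__)
    except Exception as exc:
        print("numpy import failed:", exc)
        raise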

newspaper3k, find author name in visible text after first "by" word

Newspaper3k is a good Python library for news content extraction, and it mostly works well. I want to extract names after the first "by" word in the visible text. This is my code; it does not work well, so please help:
import re
from newspaper import Config
from newspaper import Article
from newspaper import ArticleException
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request

USER_AGENT = 'Mozilla/5.0 (Macintosh;Intel Mac OS X 10.15; rv:78.0)Gecko/20100101 Firefox/78.0'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

html1 = 'https://saugeentimes.com/new-perspectives-a-senior-moment-food-glorious-food-part-2/'
article = Article(html1.strip(), config=config)
article.download()
article.parse()
soup = BeautifulSoup(article)

## I want to take only visible text
[s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
visible_text = soup.getText()

for line in visible_text:
    # Capture one-or-more words after first (By or by) the initial match
    match = re.search(r'By (\S+)', line)
    # Did we find a match?
    if match:
        # Yes, process it to print
        By = match.group(1)
        print('By {}'.format(By))
This is not a comprehensive answer, but it is one that you can build from. You will need to expand this code as you add additional sources. As I stated before, my Newspaper3k overview document has lots of extraction examples, so please review it thoroughly.
Regular expressions should be a last-ditch effort, tried only after these extraction methods in newspaper3k:
article.authors
meta tags
json
soup
from newspaper import Config
from newspaper import Article
from newspaper.utils import BeautifulSoup

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

urls = ['https://saugeentimes.com/new-perspectives-a-senior-moment-food-glorious-food-part-2',
        'https://www.macleans.ca/education/what-college-students-in-canada-can-expect-during-covid',
        'https://www.cnn.com/2021/02/12/asia/india-glacier-raini-village-chipko-intl-hnk/index.html',
        'https://www.latimes.com/california/story/2021-02-13/wildfire-santa-cruz-boulder-creek-residents-fear-water'
        '-quality',
        'https://foxbaltimore.com/news/local/maryland-lawmakers-move-ahead-with-first-tax-on-internet-ads-02-13-2021']

for url in urls:
    try:
        article = Article(url, config=config)
        article.download()
        article.parse()
        author = article.authors
        if author:
            print(author)
        elif not author:
            soup = BeautifulSoup(article.html, 'html.parser')
            author_tag = soup.find(True, {'class': ['td-post-author-name', 'byline']}).find(['a', 'span'])
            if author_tag:
                print(author_tag.get_text().replace('By', '').strip())
            else:
                print('no author found')
    except AttributeError as e:
        pass
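If a regex fallback over the visible text is still wanted, here is a small sketch of the asker's original idea, assuming the page text has already been collected into one string (the original loop iterated over the string character by character, so re.search never saw more than a single letter):

import re

# Hypothetical sample text; in the real script this would be the visible text
# extracted from the page (for example soup.get_text()).
visible_text = "New Perspectives: A Senior Moment\nBy Jane Doe\nFood, glorious food..."

# Search the whole string once and capture up to two capitalized words
# after the first "By"/"by".
match = re.search(r'\b[Bb]y\s+([A-Z]\w+(?:\s+[A-Z]\w+)?)', visible_text)
if match:
    print(match.group(1))   # -> Jane Doe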

TensorFlow Serving + gRPC "Did not read entire message"

I'm trying to call my TensorFlow model, which is deployed on a Cloud Foundry server, from a Python 2.7 API using TensorFlow Serving and gRPC. The model expects a 200-dimensional vector as input, which I have hardcoded for the moment. The connection variables are stored in the virtualenv and have been checked twice.
The code:
import os
from grpc.beta import implementations
import tensorflow as tf
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2
from grpc._cython import cygrpc

MODEL_NAME = str(os.getenv('MODEL_NAME', ''))
MODEL_SERVER_HOST = str(os.getenv('MODEL_SERVER_HOST', ''))
MODEL_SERVER_PORT = int(os.getenv('MODEL_SERVER_PORT', ''))
ROOT_CERT = str(os.getenv('ROOT_CERT', '')).replace('\\n', '\n')


def metadata_transformer(metadata):
    additions = []
    token = 'Bearer <my access token>'
    additions.append(('authorization', token))
    return tuple(metadata) + tuple(additions)


credentials = implementations.ssl_channel_credentials(root_certificates=ROOT_CERT)
channel = implementations.secure_channel(MODEL_SERVER_HOST, MODEL_SERVER_PORT, credentials)
stub = prediction_service_pb2.beta_create_PredictionService_stub(channel, metadata_transformer=metadata_transformer)

import numpy as np
data = np.matrix([0.06222425773739815, 0.08211926370859146, -0.060986146330833435, 0.13920938968658447, 0.10515272617340088, -0.06220443174242973, -0.05927170068025589, -0.054189786314964294, -0.0986655130982399, 0.013334010727703571, -0.05667420104146004, 0.059366412460803986, -0.03483295068144798, -0.05382293462753296, 0.02721281163394451, -0.1428503543138504, 0.029297124594449997, 0.07006879895925522, 0.06501731276512146, 0.028620243072509766, 0.07128454744815826, 0.029960375279188156, 0.0710490494966507, -0.04619687795639038, -0.03106304071843624, -0.04266272485256195, 0.004348727408796549, 0.03099834732711315, 0.09248803555965424, -0.036939311772584915, 0.00017547572497278452, 0.03521900251507759, 0.10932505130767822, -0.019729139283299446, 0.12315405160188675, 0.10092845559120178, -0.12633951008319855, -0.022320391610264778, 0.0870826318860054, -0.06696301698684692, -0.016253307461738586, -0.0413096621632576, -0.040929097682237625, 0.09338817000389099, -0.08800378441810608, 0.015543102286756039, 0.018787918612360954, 0.07351260632276535, 0.038140904158353806, 0.019255049526691437, 0.0875692293047905, -0.07542476058006287, -0.04116508364677429, 0.04507743567228317, -0.06986603885889053, -0.24688798189163208, -0.035459864884614944, 0.06200174242258072, -0.06932217627763748, 0.06320516765117645, -0.023999478667974472, -0.04712359234690666, 0.03672196343541145, -0.02999514900147915, 0.04105519875884056, 0.08891177922487259, 0.15175248682498932, -0.0021488466300070286, 0.04398706927895546, -0.04429445043206215, 0.04708605632185936, 0.043234940618276596, -0.043555982410907745, 0.017381751909852028, 0.048889972269535065, -0.016929129138588905, 0.01731136068701744, -0.04694319888949394, 0.20381565392017365, 0.009074307978153229, 0.004490611143410206, -0.08525945991277695, -0.03385556861758232, 0.017475442960858345, -0.040392760187387466, 0.14970248937606812, 0.042721331119537354, -0.1257765144109726, -0.07097769528627396, -0.10943038016557693, 0.015442096628248692, -0.06519876420497894, -0.07588690519332886, -0.07620779424905777, 0.04572996124625206, -0.058589719235897064, -0.04492143541574478, -0.01922304928302765, -0.008066931739449501, 0.04317406192421913, 0.020763304084539413, -0.025430725887417793, 0.04271349683403969, 0.07393930852413177, 0.0020402593072503805, 0.0783640518784523, 0.047386448830366135, 0.010610940866172314, 0.022059153765439987, 0.034980181604623795, -0.006882485933601856, -0.08911270648241043, -0.001243607490323484, -0.06307544559240341, -0.01352659147232771, -0.24622271955013275, 0.07930449396371841, 0.03659113869071007, -0.05077377334237099, 0.08726480603218079, -0.09274136275053024, -0.05766649544239044, -0.12269984930753708, 0.056026071310043335, -0.0048304214142262936, -0.05568183213472366, -0.08890420943498611, -0.02911136858165264, -0.0944124087691307, 0.0011820291401818395, -0.08908636122941971, -0.008728212676942348, -0.014545259065926075, -0.008866528049111366, 0.02728298306465149, -0.020994992926716805, 0.031155599281191826, 0.036098793148994446, 0.06911332905292511, -0.06691643595695496, -0.00014896543871145695, -0.007080242037773132, 0.0031992685981094837, 0.043563224375247955, 0.02550852671265602, -0.015397937037050724, 0.06041031703352928, -0.08981014788150787, -0.10881254076957703, 0.03226703032851219, -0.02039985917508602, -0.05354547128081322, -0.026514282450079918, 0.09616094827651978, -0.04160488396883011, -0.06793050467967987, -0.17060619592666626, -0.08044841140508652, 0.042605575174093246, 0.08186516910791397, 0.026051705703139305, 
0.1254323273897171, 0.09807661175727844, 0.04692094400525093, 0.05536479875445366, 0.004592049401253462, 0.01953544095158577, -0.02827763929963112, 0.11051501333713531, -0.05077047273516655, -0.09987067431211472, 0.025186538696289062, -0.24119670689105988, -0.054666098207235336, 0.03561021387577057, -0.006030901800841093, 0.14740994572639465, 0.09515859931707382, 0.0628485381603241, 0.020558597519993782, -0.04458167776465416, -0.04740617796778679, 0.024550801143050194, -0.09533495455980301, 0.057229768484830856, -0.08855120837688446, 0.027864644303917885, -0.07248448580503464, 0.0647491067647934, 0.09660986065864563, 0.038834456354379654, -0.030274877324700356, -0.024261653423309326, 0.05457066744565964, -0.00860705878585577, 0.04901411384344101, 0.017157232388854027, -0.02722001262009144, 0.012187148444354534, 0.05596058815717697])
request = predict_pb2.PredictRequest()
request.model_spec.name = MODEL_NAME
request.model_spec.signature_name = 'ticketCatFeature2'
request.inputs['input'].CopyFrom(
    tf.contrib.util.make_tensor_proto(data, shape=[200]))
print stub.Classify(request, 10)
I'm getting the following error message when running the app:
Traceback (most recent call last):
File "app.py", line 36, in
print stub.Classify(request, 10)
File "/home/vagrant/Desktop/Masterarbeit/appDir/venv/local/lib/python2.7/site-packages/grpc/beta/_client_adaptations.py", line 309, in call
self._request_serializer, self._response_deserializer)
File "/home/vagrant/Desktop/Masterarbeit/appDir/venv/local/lib/python2.7/site-packages/grpc/beta/_client_adaptations.py", line 195, in _blocking_unary_unary
raise _abortion_error(rpc_error_call)
grpc.framework.interfaces.face.face.AbortionError: AbortionError(code=StatusCode.INTERNAL, details="Did not read entire message")
Log of grpc Debug: https://ufile.io/owk76
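One mismatch worth checking (an assumption on my part, since the serving side is not shown): the request built above is a predict_pb2.PredictRequest, but it is sent to the Classify endpoint, which expects a ClassificationRequest. A hedged sketch of calling the matching Predict RPC instead:

# Sketch only: send the PredictRequest to the Predict RPC; Classify expects a
# ClassificationRequest, so a PredictRequest serialized into it can produce
# "Did not read entire message". The 10 is the timeout in seconds, as above.
result = stub.Predict(request, 10)
print(result)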

List index out of range error in splunk API call

I am trying to get the result of a search query from Splunk, but when I try to get the session key, I get the following error.
Traceback (most recent call last):
File "splunkenter.py", line 18, in <module>
sessionkey = minidom.parseString(servercontent).getElementsByTagName('sessionKey')[0].childNodes[0].nodeValue
IndexError: list index out of range
splunkenter.py:
import urllib
import httplib2
import time
import re
from time import localtime, strftime
from xml.dom import minidom
import json

baseurl = 'abc.haihd.com:8000'
username = 'xxxxx'
password = 'xxxxx'

myhttp = httplib2.Http()

#Step 1: Get a session key
servercontent = myhttp.request(baseurl + '/services/auth/login', 'POST', headers={}, body=urllib.urlencode({'username': username, 'password': password}))[1]
sessionkey = minidom.parseString(servercontent).getElementsByTagName('sessionKey')[0].childNodes[0].nodeValue
print "====>sessionkey: %s <====" % sessionkey
Can anybody tell me where the problem lies? I am very new to APIs.
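A diagnostic sketch, reusing the names above (the host, port, and scheme are assumptions): the IndexError means getElementsByTagName('sessionKey') came back empty, so the response is probably an error document rather than a login response. Printing the raw response before parsing shows what actually came back; note that Splunk's REST API normally listens on the management port (typically 8089, over https), not the web UI port 8000, so baseurl is worth checking too.

# Print the raw login response before parsing it, and guard the empty lookup.
resp, servercontent = myhttp.request('https://abc.haihd.com:8089/services/auth/login', 'POST',
                                     headers={}, body=urllib.urlencode({'username': username,
                                                                        'password': password}))
print resp.status
print servercontent

elements = minidom.parseString(servercontent).getElementsByTagName('sessionKey')
if elements:
    sessionkey = elements[0].childNodes[0].nodeValue
    print "====>sessionkey: %s <====" % sessionkey
else:
    print "No sessionKey element found; check the URL, port, and credentials"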

Why does my CrawlerProcess not have the function "crawl"?

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from items import BackpageItem, CityvibeItem
from scrapy.shell import inspect_response
import re
import time
import sys


class MySpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['www.example.com']

    # Set last_page to decide how many pages are crawled
    last_page = 10
    start_urls = ['http://www.example.com/washington/?page=%s' % page for page in xrange(1, last_page)]
    rules = (
        # Follow all links inside <div class="cat"> and call parse_item on each link
        Rule(LinkExtractor(
            restrict_xpaths=('//a[@name="listing_link"]')),
            callback='parse_item'),
    )

    # Extract relevant text from the website into an ExampleItem
    def parse_item(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('string(//h2[@class="post-title"]/text())').extract()
        item['desc'] = response.xpath('string(//div[@class="section post-body"]/text())').extract()
        item['url'] = response.url
        item['location'] = response.xpath('string(//div[@class="posting"]/div[2]/text())').extract()
        item['posted_date'] = response.xpath('string(//div[@class="post-date"]/span/text())').extract()  #.re("(?<=Posted\s*).*")
        item['crawled_date'] = time.strftime("%c")
        # not sure how to get the other image urls right now
        item['image_urls'] = response.xpath('string(//div[@class="section post-contact-container"]/div/div/img/@src)').extract()
        # I can't find this section on any pages right now
        item['other_ad_urls'] = response.xpath('//a[@name="listing_link"]/@href').extract()
        item['phone_number'] = "".join(response.xpath('//div[@class="post-info"]/span[contains(text(), "Phone")]/following-sibling::a/text()').extract())
        item['email'] = "".join(response.xpath('//div[@class="post-info"]/span[contains(text(), "Email")]/following-sibling::a/text()').extract())
        item['website'] = "".join(response.xpath('//div[@class="post-info limit"]/span[contains(text(), "Website")]/following-sibling::a/text()').extract())
        item['name'] = response.xpath('//div[@class="post-name"]/text()').extract()
        # uncomment for debugging
        #inspect_response(response, self)
        return item


# process1 = CrawlerProcess({
#     'ITEM_PIPELINES': {
#         #'scrapy.contrib.pipeline.images.ImagesPipeline': 1
#         'backpage.pipelines.GeolocationPipeline': 4,
#         'backpage.pipelines.LocationExtractionPipeline': 3,
#         'backpage.pipelines.BackpagePipeline': 5
#     }
# });

process1 = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process1.crawl(MySpider)
process1.start()
My spider works perfectly when I run it from the command line with
scrapy crawl example
but I will need to run multiple spiders, so I want to put them all in a script and use CrawlerProcess. When I try to run this I get the error,
AttributeError: 'CrawlerProcess' object has no attribute 'crawl'
This is scrapy version 0.24.6.
All items and pipelines are correct, because the spider works from the command line.
There is (was?) a compatibility problem between Scrapy and Scrapyd. I needed to run Scrapy 0.24 and Scrapyd 1.0.1.
Here is the issue on Github
https://github.com/scrapy/scrapyd/issues/100#issuecomment-115268880
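A quick sanity check of which versions the interpreter running the script actually sees (a minimal sketch; the versions to aim for are the ones named above):

# Print the Scrapy (and, if installed, Scrapyd) versions visible to this
# interpreter, so a mismatch with the versions named above is easy to spot.
import scrapy
print(scrapy.__version__)

try:
    import scrapyd
    print(getattr(scrapyd, '__version__', 'version attribute not present'))
except ImportError:
    print('scrapyd is not installed in this environment')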