How to connect PyMongo over SSH, as Robo3T does?

I am trying to connect to a remote MongoDB server over SSH with PyMongo, using the Python package sshtunnel. The same connection works in Robo3T but fails in Python.
This is my code:
from sshtunnel import SSHTunnelForwarder
from pymongo import MongoClient
from pprint import pprint

MONGO_HOST = "localhost:27017"
MONGO_DB = "dbasename"
MONGO_USER = "username"
MONGO_PASS = "password"

server = SSHTunnelForwarder(
    MONGO_HOST,
    ssh_username=MONGO_USER,
    ssh_password=MONGO_PASS,
    remote_bind_address=('10.0.0.244', 22)
)
server.start()

client = MongoClient('127.0.0.1', server.local_bind_port)
db = client[MONGO_DB]
The code stops at server.start(). This is the error:
'Could not establish session to SSH gateway'

This is the code that works:
from sshtunnel import SSHTunnelForwarder
from pymongo import MongoClient
from pprint import pprint
MONGO_HOST = "localhost:27017"
MONGO_DB = "dbasename"
MONGO_USER = "username"
MONGO_PASS = "password"
server = SSHTunnelForwarder(
    MONGO_HOST,
    ssh_username=MONGO_USER,
    ssh_password=MONGO_PASS,
    remote_bind_address=('localhost', 27017)
)
server.start()

client = MongoClient(host='localhost', port=server.local_bind_port)
db = client[MONGO_DB]
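For reference, the pattern that generally works with sshtunnel is to point SSHTunnelForwarder at the SSH server itself (port 22) and forward a local port to the mongod listening on the remote machine, which is the same thing Robo3T configures in its SSH tab. A minimal sketch, assuming placeholder SSH credentials and that mongod listens on 127.0.0.1:27017 on the remote host:
from sshtunnel import SSHTunnelForwarder
from pymongo import MongoClient

SSH_HOST = "10.0.0.244"       # placeholder: the machine you SSH into
SSH_USER = "ssh_username"     # placeholder SSH credentials (not the Mongo user)
SSH_PASS = "ssh_password"
MONGO_DB = "dbasename"

# Tunnel: connect to the SSH server on port 22 and forward a local port
# to the mongod that listens on 127.0.0.1:27017 of the remote machine.
server = SSHTunnelForwarder(
    (SSH_HOST, 22),
    ssh_username=SSH_USER,
    ssh_password=SSH_PASS,
    remote_bind_address=('127.0.0.1', 27017)
)
server.start()

# PyMongo then connects through the forwarded local port.
client = MongoClient('127.0.0.1', server.local_bind_port)
db = client[MONGO_DB]
print(db.list_collection_names())

client.close()
server.stop()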

Related

Undetected chromedriver: how to add a proxy with username/password?

I am adding Chrome options this way, and it works if I use proxy IP authentication.
options = webdriver.ChromeOptions()
options.headless = True
options.add_argument('--proxy-server=92.128.165.143:3399')
driver = uc.Chrome(options=options)
However, I have a proxy with authentication in this format:
http://username:password@91.92.128.165.143:3399
If I add it like
options.add_argument('--proxy-server=http://username:password@91.92.128.165.143:3399')
it doesn't work. How could I add it with username/password? This applies only to undetected chrome driver.
I think I have already achieved this with the help of selenium-wire, but it did not work with a Kivy GUI. For plain scripting you can carry on like this, but if you want to use it with Kivy you will definitely get an error.
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.proxy import *
from seleniumwire import undetected_chromedriver as uc
from fake_useragent import UserAgent
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from pyclick import HumanClicker
import pyautogui
import time

clicked = False
hostname = "188.74.183.126"
port = "8395"
proxy_username = "wclmiemy"
proxy_password = "a9hoxl4phkzr"

# selenium-wire handles the proxy authentication itself, so the credentials
# can go straight into the proxy URL.
chrome_options = {
    'proxy': {
        'http': f'http://{proxy_username}:{proxy_password}@{hostname}:{port}',
        'https': f'https://{proxy_username}:{proxy_password}@{hostname}:{port}',
        'no_proxy': 'localhost,127.0.0.1'
    }
}

def delete_cache(driver):
    driver.execute_script("window.open('')")            # create a separate tab from the main one
    driver.switch_to.window(driver.window_handles[-1])  # switch to the second tab
    driver.get('chrome://settings/clearBrowserData')    # open the Chrome settings page
    pyautogui.click("clear_data.png")                   # click the "Clear data" button by image match

if __name__ == '__main__':
    email = "moqaddasmehran5@gmail.com"
    password = "moqaddaszaheenzaheen"
    ua = UserAgent()
    userAgent = ua.random
    options = webdriver.ChromeOptions()
    options.add_argument(f'user-agent={userAgent}')
    # options.add_argument('--ignore-certificate-errors-spki-list')
    # options.add_argument('--ignore-ssl-errors')
    options.add_argument("--disable-infobars")
    browser = uc.Chrome(
        driver_executable_path="chromedriver",
        seleniumwire_options=chrome_options,
        options=options,
        use_subprocess=True
    )
    browser.maximize_window()
    browser.get('https://www.youtube.com/watch?v=uPxkrGL0l7U')
This code is kind of messy, because I am in a hurry, but I hope you will be able to modify it. You will also get an SSL certificate that you need to import into Chrome, and that's it; you will definitely get it working.
Use the following code to add proxy with username and password:
from selenium import webdriver
PROXY = "http://username:password@91.92.128.165.143:3399"
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=%s' % PROXY)
driver = webdriver.Chrome(options=chrome_options)
Edit:
I found this: How to set proxy with authentication in selenium chromedriver python?
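That linked question takes the extension route, since Chrome ignores credentials embedded in --proxy-server. Roughly, the idea is to generate a tiny helper extension that answers the proxy authentication challenge itself. A minimal sketch of that approach (untested here; the host, port, and credentials are placeholders):
import zipfile

# Placeholders: substitute your real proxy host, port, and credentials.
PROXY_HOST = "92.128.165.143"
PROXY_PORT = 3399
PROXY_USER = "username"
PROXY_PASS = "password"

manifest_json = """
{
    "version": "1.0.0",
    "manifest_version": 2,
    "name": "Proxy Auth Helper",
    "permissions": ["proxy", "webRequest", "webRequestBlocking", "<all_urls>"],
    "background": {"scripts": ["background.js"]}
}
"""

background_js = """
var config = {
    mode: "fixed_servers",
    rules: {
        singleProxy: {scheme: "http", host: "%s", port: %d},
        bypassList: ["localhost"]
    }
};
chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

chrome.webRequest.onAuthRequired.addListener(
    function(details) {
        return {authCredentials: {username: "%s", password: "%s"}};
    },
    {urls: ["<all_urls>"]},
    ["blocking"]
);
""" % (PROXY_HOST, PROXY_PORT, PROXY_USER, PROXY_PASS)

# Pack the two files into an extension and load it when starting Chrome.
with zipfile.ZipFile("proxy_auth_extension.zip", "w") as zf:
    zf.writestr("manifest.json", manifest_json)
    zf.writestr("background.js", background_js)

# options = webdriver.ChromeOptions()
# options.add_extension("proxy_auth_extension.zip")
# driver = webdriver.Chrome(options=options)
Whether undetected_chromedriver accepts a packed extension this way may depend on its version, so treat this only as an outline of the technique the link describes.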

Access Denied issue in AWS Glue while performing a simple ETL task

I am facing an error while trying to run an AWS Glue job. I am trying to copy data from my table, which was populated with the help of a crawler.
The error is given below:
An error occurred while calling o91.pyWriteDynamicFrame. Access Denied (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 6YRCFCNCKW9ZK2PF; S3 Extended Request ID: 7v/5/dEhaxjIMMxfpCEu5vT6fwzmyV0kIphicPvUDYKY23rFYN1ALn2qo/N3CcIUEhSrOGKklW4=; Proxy: null)
My script is given below:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Script generated for node AWS Glue Data Catalog
AWSGlueDataCatalog_node1647583966899 = glueContext.create_dynamic_frame.from_catalog(
    database="s3-databse-mockdata",
    table_name="mock_data_csv",
    transformation_ctx="AWSGlueDataCatalog_node1647583966899",
)
# Script generated for node Amazon S3
AmazonS3_node1647583976365 = glueContext.getSink(
    path="s3://destination-001/",
    connection_type="s3",
    updateBehavior="UPDATE_IN_DATABASE",
    partitionKeys=[],
    enableUpdateCatalog=True,
    transformation_ctx="AmazonS3_node1647583976365",
)
AmazonS3_node1647583976365.setCatalogInfo(
    catalogDatabase="s3-databse-mockdata", catalogTableName="dest-table"
)
AmazonS3_node1647583976365.setFormat("csv")
AmazonS3_node1647583976365.writeFrame(AWSGlueDataCatalog_node1647583966899)
job.commit()
I am unable to find what the problem is.
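For what it's worth, a 403 AccessDenied from o91.pyWriteDynamicFrame usually points at the IAM role the Glue job runs as rather than at the script: the role can read the catalog table but is not allowed to write to s3://destination-001/. As a rough illustration only (the role name and policy name below are hypothetical placeholders), the missing permission would look something like this inline policy attached to the job's role:
import json
import boto3

# Placeholders: adjust to your actual Glue job role and destination bucket.
GLUE_ROLE_NAME = "AWSGlueServiceRole-mockdata"   # hypothetical role name
DEST_BUCKET = "destination-001"

s3_write_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": ["s3:GetObject", "s3:PutObject", "s3:DeleteObject", "s3:ListBucket"],
            "Resource": [
                f"arn:aws:s3:::{DEST_BUCKET}",
                f"arn:aws:s3:::{DEST_BUCKET}/*",
            ],
        }
    ],
}

iam = boto3.client("iam")
iam.put_role_policy(
    RoleName=GLUE_ROLE_NAME,
    PolicyName="glue-destination-s3-write",
    PolicyDocument=json.dumps(s3_write_policy),
)
The same statement can equally be added through the IAM console on whichever role is selected in the Glue job's "IAM Role" setting.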

Unable to use scraperapi with selenium

I am trying to use ScraperAPI with Selenium. It runs fine when I use it with Python requests, using the following code.
import requests
proxies = {
    "http": "http://scraperapi:my_api_key@proxy-server.scraperapi.com:8001",
    "https": "http://scraperapi:my_api_key@proxy-server.scraperapi.com:8001"
}
r = requests.get('http://httpbin.org/ip', proxies=proxies, verify=False)
print(r.text)
It returns the proxy IP with the above code.
But it returns my original IP when I try with the following code.
from selenium import webdriver
PATH = r'C:\Program Files (x86)\chromedriver.exe'
proxy = "http://api.scraperapi.com?api_key=my_api_key&render=true"
options = webdriver.ChromeOptions()
options.add_argument(f'--proxy-server={proxy}')
driver = webdriver.Chrome(PATH, options=options)
url = 'http://httpbin.org/ip'
driver.get(url)
According to ScraperAPI's own guides, the easiest way seems to be using selenium-wire instead of plain Selenium:
from seleniumwire import webdriver
API_KEY = 'YOUR_API_KEY'
proxy_options = {
    'proxy': {
        'http': f'http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001',
        'no_proxy': 'localhost,127.0.0.1'
    }
}
driver = webdriver.Chrome(seleniumwire_options=proxy_options)
driver.get("http://httpbin.org/ip")
https://www.scraperapi.com/quick-start-guides/python-selenium-scraper/
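One caveat: the guide's snippet only sets the 'http' key. If HTTPS pages still report your real IP, it may help to also add an 'https' entry pointing at the same endpoint, a small unverified tweak mirroring the requests example above:
proxy_options['proxy']['https'] = f'http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001'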

pyspark read from s3 and write to elasticsearch

I'm trying to read from S3 and write to Elasticsearch, using a Jupyter installation on the Spark master machine.
I have this configuration:
import pyspark
import os
#os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell"
import findspark
findspark.init()
from pyspark.sql import SparkSession
import configparser
config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/credentials"))
aws_profile='DEFAULT'
access_id = config.get(aws_profile, "aws_access_key_id")
access_key = config.get(aws_profile, "aws_secret_access_key")
from pyspark import SparkContext, SparkConf
sc_conf = SparkConf()
sc_conf.setAppName("app-3-logstash")
sc_conf.setMaster('spark://172.31.25.152:7077')
sc_conf.set('spark.executor.memory', '24g')
sc_conf.set('spark.executor.cores', '8')
sc_conf.set('spark.cores.max', '32')
sc_conf.set('spark.logConf', True)
sc_conf.set('spark.packages', 'org.apache.hadoop:hadoop-aws:2.7.3')
sc_conf.set('spark.jars', '/usr/local/spark/jars/elasticsearch-hadoop-7.6.0/dist/elasticsearch-spark-20_2.11-7.6.0.jar')
sc = SparkContext(conf=sc_conf)
hadoop_conf=sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoop_conf.set("fs.s3n.awsAccessKeyId", access_id)
hadoop_conf.set("fs.s3n.awsSecretAccessKey", access_key)
Using this configuration, I get access to ES but not to S3.
When I try to read from S3 using this conf, I get this error:
Py4JJavaError: An error occurred while calling
z:org.apache.spark.api.python.PythonRDD.collectAndServe. :
java.lang.RuntimeException: java.lang.ClassNotFoundException: Class
org.apache.hadoop.fs.s3native.NativeS3FileSystem not found
When I disable sc_conf.set('spark.packages', ...) and sc_conf.set('spark.jars', ...) and enable os.environ['PYSPARK_SUBMIT_ARGS'] instead, I do get access to S3, but not to ES.
What am I missing?
Thanks,
Yaniv
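One detail that may be the culprit: 'spark.packages' is not a Spark configuration key (the Maven-resolution key is 'spark.jars.packages'), so with the jar-based setup the hadoop-aws classes never reach the executors. A sketch, reusing the versions and paths from the question, of requesting both dependencies before the context is created:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf

sc_conf = SparkConf()
sc_conf.setAppName("app-3-logstash")
sc_conf.setMaster('spark://172.31.25.152:7077')
# 'spark.jars.packages' (not 'spark.packages') resolves hadoop-aws from Maven,
# while 'spark.jars' adds the locally installed elasticsearch-spark jar.
sc_conf.set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:2.7.3')
sc_conf.set('spark.jars', '/usr/local/spark/jars/elasticsearch-hadoop-7.6.0/dist/elasticsearch-spark-20_2.11-7.6.0.jar')

sc = SparkContext(conf=sc_conf)
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
This is only a sketch under those assumptions; depending on your Spark build, matching the hadoop-aws version to the cluster's Hadoop version may also be necessary.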

Load session cookies with scrapy

I'm using Scrapy to scrape sites that require login, but I'm not sure exactly which fields I need to save and load in order to keep the session.
With selenium I'm doing the following to save the cookies:
import pickle
import selenium.webdriver
driver = selenium.webdriver.Firefox()
driver.get("http://www.google.com")
pickle.dump( driver.get_cookies() , open("cookies.pkl","wb"))
And this to load them:
import pickle
import selenium.webdriver
driver = selenium.webdriver.Firefox()
driver.get("http://www.google.com")
cookies = pickle.load(open("cookies.pkl", "rb"))
for cookie in cookies:
    driver.add_cookie(cookie)
And it works just fine. Is it possible to do exactly this using Scrapy?
Send a request using cookies:
request_with_cookies = Request(url="http://www.example.com", cookies={'currency': 'USD', 'country': 'UY'})
Get the cookies set by a response:
cookies_from_response = response.headers.getlist('Set-Cookie')
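Putting the two snippets together, a minimal sketch of a spider that reuses cookies saved by the Selenium code above (the pickle filename and target URL are placeholders) could look like this:
import pickle
import scrapy

class SessionSpider(scrapy.Spider):
    name = "session_spider"

    def start_requests(self):
        # Load the cookies saved earlier (e.g. by the Selenium pickle snippet above).
        with open("cookies.pkl", "rb") as f:
            saved = pickle.load(f)
        # Scrapy accepts cookies as a dict or as a list of dicts with
        # 'name'/'value' (and optionally 'domain'/'path') keys, which is
        # compatible with what Selenium's get_cookies() returns.
        cookies = [{"name": c["name"], "value": c["value"]} for c in saved]
        yield scrapy.Request(
            "http://www.example.com",
            cookies=cookies,
            callback=self.parse,
        )

    def parse(self, response):
        # Follow-up requests keep the session automatically as long as the
        # default cookie middleware is enabled.
        self.logger.info("Set-Cookie headers: %s",
                         response.headers.getlist("Set-Cookie"))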