My application is a web scraper (for the most part) that stores information in a database. I have 2 classes so far:
clsSpyder - This essentially rolls-up the scraper processes
clsDB - This does any database processes
My test program looks over all the URLs, scrapes, pushes into the database. It is pretty simple sequentially, but I would like to have say N number of threads running those processes (scrape and store). My sequential code is this:
Private Sub Button4_Click(sender As Object, e As EventArgs) Handles Button4.Click
'Grab List
Dim tDS As New DataSet
Dim tDB As New clsTermsDB
Dim tSpyder As New clsAGDSpyder
Dim sResult As New TermsRuns
'Grab a list of all URLS
tDS = tDB.GetTermsList(1)
Try
For Each Row As DataRow In tDS.Tables(0).Rows
rtbList.AppendText(Row("url_toBeCollected") & vbCrLf)
sResult = tSpyder.SpiderPage(Row("url_toBeCollected"))
'If nothing is found, do not store
If sResult.html <> "" And sResult.text <> "" Then
tDB.InsertScrape(Now(), sResult.html, sResult.text, Row("url_uid"), 1)
End If
Next
Exit Sub
Catch ex As Exception
MessageBox.Show(ex.Message)
End Try
End Sub
With that in mind and noting that I am passing variables to the SpiderPage and InsertScrape methods.. How could I implement threading? It's gotta be simple, but I feel like I have been googling and trying things for days without success :(
*** ADDED: SpiderPage method:
Public Function SpiderPage(PageURL As String) As TermsRuns
Dim webget As New HtmlWeb
Dim node As HtmlNode
Dim doc As New HtmlDocument
Dim docNOHTML As HtmlDocument
Dim uri As New Uri(PageURL)
Dim wc As HttpWebRequest = DirectCast(WebRequest.Create(uri.AbsoluteUri), HttpWebRequest)
Dim wcStream As Stream
wc.AllowAutoRedirect = True
wc.MaximumAutomaticRedirections = 3
'Set Headers
wc.UserAgent = "Mozilla/5.0 (Macintosh; I; Intel Mac OS X 11_7_9; de-LI; rv:1.9b4) Gecko/2012010317 Firefox/10.0a4"
wc.Headers.Add("REMOTE_ADDR", "66.83.101.5")
wc.Headers.Add("HTTP_REFERER", "66.83.101.5")
'Set HTMLAgility Kit Useragent Spoofing (not needed, I don't think)
webget.UserAgent = "Mozilla/5.0 (Macintosh; I; Intel Mac OS X 11_7_9; de-LI; rv:1.9b4) Gecko/2012010317 Firefox/10.0a4"
'Certification STuff
wc.UseDefaultCredentials = True
wc.Proxy.Credentials = System.Net.CredentialCache.DefaultCredentials
ServicePointManager.ServerCertificateValidationCallback = AddressOf AcceptAllCertifications
'Create Cookie Jar
Dim CookieJar As New CookieContainer
wc.CookieContainer = CookieJar
'Keep Alive Settings
wc.KeepAlive = True
wc.Timeout = &H7530
'Read the web page
Dim wr As HttpWebResponse = Nothing
Try
wcStream = wc.GetResponse.GetResponseStream
doc.Load(wcStream)
'Remove HTML from the document
docNOHTML = RemoveUnWantedTags(doc)
'Grab only the content inside the <body> tag
node = docNOHTML.DocumentNode.SelectSingleNode("//body")
'Output
SpiderPage = New TermsRuns
SpiderPage.html = node.InnerHtml
SpiderPage.text = node.InnerText
Return SpiderPage
Catch ex As Exception
'Something goes here when scraping returns an error
SpiderPage = New TermsRuns
SpiderPage.html = ""
SpiderPage.text = ""
End Try
End Function
*** Added InsertScrape:
Public Function InsertScrape(scrape_ts As DateTime, scrape_html As String, scrape_text As String, url_id As Integer, tas_id As Integer) As Boolean
Dim myCommand As MySqlClient.MySqlCommand
Dim dt As New DataTable
'Create ds/dt for fill
Dim ds As New DataSet
Dim dtbl As New DataTable
Try
'Set Connection String
myConn.ConnectionString = myConnectionString
'Push Command to Client Object
myCommand = New MySqlClient.MySqlCommand
myCommand.Connection = myConn
myCommand.CommandText = "spInsertScrape"
myCommand.CommandType = CommandType.StoredProcedure
myCommand.Parameters.AddWithValue("#scrape_ts", scrape_ts)
myCommand.Parameters("#scrape_ts").Direction = ParameterDirection.Input
myCommand.Parameters.AddWithValue("#scrape_html", scrape_html)
myCommand.Parameters("#scrape_html").Direction = ParameterDirection.Input
myCommand.Parameters.AddWithValue("#scrape_text", scrape_text)
myCommand.Parameters("#scrape_text").Direction = ParameterDirection.Input
myCommand.Parameters.AddWithValue("#url_id", url_id)
myCommand.Parameters("#url_id").Direction = ParameterDirection.Input
myCommand.Parameters.AddWithValue("#tas_id", tas_id)
myCommand.Parameters("#tas_id").Direction = ParameterDirection.Input
'Open Connection
myConn.Open()
myCommand.ExecuteNonQuery()
'Close Connection
myConn.Close()
InsertScrape = True
Catch ex As Exception
'Put Message Here
InsertScrape = False
MessageBox.Show(ex.Message)
End Try
End Function
thanks in advance.
Related
I'm trying to zip a set of pdf files and send to client as download.
No matter what combinations of Response settings I try, the code doesn't throw any exception and apparently the zip file stream is created fine, but the file is not sent to the client as a download and when you hit the download button nothing happens.
Private Sub lkbDownloadPdfs_Click(sender As Object, e As System.EventArgs) Handles aDownloadPdfs.ServerClick
Try
Dim WSStockToolAuthTokenUrl As String = ConfigurationManager.AppSettings("WSStockToolAuthTokenUrl")
Dim auth As AuthenticationHeader = Utility.GetAuthenticationForStockToolToken()
Dim token As String = Utility.GetInitializationToken(WSStockToolAuthTokenUrl, auth.UserName, auth.password)
Response.Clear()
Response.ContentType = "application/zip"
Response.AppendHeader("Content-Disposition", "attachment; filename=files.zip")
If (token IsNot Nothing) Then
Dim result As String = PDFApiCallResult(token)
Dim pdfPathList As List(Of String) = Utility.GeneratePDFList(result)
If (pdfPathList.Count = 1) Then
Dim pdfPath As String = pdfPathList.ElementAt(0)
Dim strFile As String
Dim strmZipOutputStream = New ZipOutputStream(Response.OutputStream)
strmZipOutputStream.SetLevel(9)
Dim objCrc32 As New Crc32()
For Each strFile In pdfPathList
Dim Client As WebClient = New WebClient()
Dim strmFile As Stream = Client.OpenRead(strFile)
Dim reader As StreamReader = New StreamReader(strmFile)
Dim Content As String = reader.ReadToEnd()
Dim abyBuffer(Convert.ToInt32(Content.Length - 1)) As Byte
Dim sFile As String = Path.GetFileName(strFile)
Dim theEntry As ZipEntry = New ZipEntry(sFile)
theEntry.DateTime = DateTime.Now
theEntry.Size = Content.Length
strmFile.Close()
objCrc32.Reset()
objCrc32.Update(abyBuffer)
theEntry.Crc = objCrc32.Value
strmZipOutputStream.PutNextEntry(theEntry)
strmZipOutputStream.Write(abyBuffer, 0, abyBuffer.Length)
Next
strmZipOutputStream.Flush()
strmZipOutputStream.Finish()
strmZipOutputStream.Close()
Response.Flush()
Response.Close()
Response.SuppressContent = True
HttpContext.Current.ApplicationInstance.CompleteRequest()
Else
End If
End If
Catch ex As Exception
ExceptionManager.Publish(ex)
End Try
End Sub
Any help? (If you have working C# code I could try to convert it to vb.net too)
Update 1: This is the aspx where the link which does the callback resides:
<asp:UpdatePanel runat="server" ID="updImages" UpdateMode="Conditional">
...
DOWNLOAD PDFS
...
<asp:AsyncPostBackTrigger ControlID="lkbAddToWishlist" />
<asp:AsyncPostBackTrigger ControlID="ddlCustomizations" />
</asp:UpdatePanel>
I've read that maybe the response is not working at all because of the ajax way the update panel does the postback, but not sure about that and how to deal with that.
I am loading the CSV file to vb.net datatable .
then I am using Bulk copy command to write to database.
The problem is if one date cell in the data table is empty, i am receiving
ORA:-01840 input value not long enough for date format.
How to resolve this.
Public Class Form1
Dim cn As New OracleConnection("Data Source =(DESCRIPTION =(ADDRESS = (PROTOCOL = TCP)(HOST = ipaddressofserver)(PORT = portofserver))(CONNECT_DATA =(SERVER = DEDICATED)(SERVICE_NAME = oracleservicename)) ) ; User Id=useridofdb;Password=passwordofdb")
cn.Open()
MsgBox("connection opened")
Dim dt As DataTable = ReadCSV("D:\test.csv")
DataGridView1.DataSource = dt
Try
Dim _bulkCopy As New OracleBulkCopy(cn)
_bulkCopy.DestinationTableName = "TEST_TABLE"
_bulkCopy.BulkCopyTimeout = 10800
_bulkCopy.WriteToServer(dt)
cn.Close()
cn.Dispose()
cn = Nothing
MsgBox("Finished")
Catch ex As Exception
MsgBox(ex.Message)
End Try
End Sub
Function ReadCSV(ByVal path As String) As System.Data.DataTable
Try
Dim sr As New StreamReader(path)
Dim fullFileStr As String = sr.ReadToEnd()
sr.Close()
sr.Dispose()
Dim lines As String() = fullFileStr.Split(ControlChars.Lf)
Dim recs As New DataTable()
Dim sArr As String() = lines(0).Split(","c)
For Each s As String In sArr
recs.Columns.Add(New DataColumn())
Next
Dim row As DataRow
Dim finalLine As String = ""
For Each line As String In lines
row = recs.NewRow()
finalLine = line.Replace(Convert.ToString(ControlChars.Cr), "")
row.ItemArray = finalLine.Split(","c)
recs.Rows.Add(row)
Next
Return recs
Catch ex As Exception
Throw ex
End Try
End Function
End Class
I have a program that runs in the background looping to check if a page on the site has been changed. It works once and shows the message box but if I change it again it won't do anything.
Imports System.Net
Imports System.String
Imports System.IO
Module Main
Sub Main()
While 1 = 1
Dim client As WebClient = New WebClient()
Dim reply As String = client.DownloadString("http://noahcristinotesting.dx.am/file.txt")
If reply.Contains("MsgBox") Then
Dim Array() As String = reply.Split(":")
MessageBox.Show(Array(2), Array(1))
Dim request As System.Net.FtpWebRequest = DirectCast(System.Net.WebRequest.Create("ftp://noahcristinotesting.dx.am/noahcristinotesting.dx.am/file.txt"), System.Net.FtpWebRequest)
request.Credentials = New System.Net.NetworkCredential("username", "password")
request.Method = System.Net.WebRequestMethods.Ftp.UploadFile
Dim path As String = "C:\test.txt"
Dim createText As String = "completed"
File.WriteAllText(path, createText)
Dim fileftp() As Byte = System.IO.File.ReadAllBytes("C:\test.txt")
Dim strz As System.IO.Stream = request.GetRequestStream()
strz.Write(fileftp, 0, fileftp.Length)
strz.Close()
strz.Dispose()
End If
End While
End Sub
End Module
Not sure at this moment what is causing it to crash when run outside of the IDE, but try trapping exceptions that are being thrown in the loop. I imagine there's an exception happening, cratering your app. The below catch block is by no means production ready, normally you want to catch specific exceptions in order to handle them effectively, but this is a cheap way to see if an exception is being thrown and what it is at runtime.
Sub Main()
Try
While 1 = 1
Dim client As WebClient = New WebClient()
Dim reply As String = client.DownloadString("http://noahcristinotesting.dx.am/file.txt")
If reply.Contains("MsgBox") Then
Dim Array() As String = reply.Split(":")
MessageBox.Show(Array(2), Array(1))
Dim request As System.Net.FtpWebRequest = DirectCast(System.Net.WebRequest.Create("ftp://noahcristinotesting.dx.am/noahcristinotesting.dx.am/file.txt"), System.Net.FtpWebRequest)
request.Credentials = New System.Net.NetworkCredential("username", "password")
request.Method = System.Net.WebRequestMethods.Ftp.UploadFile
Dim path As String = "C:\test.txt"
Dim createText As String = "completed"
File.WriteAllText(path, createText)
Dim fileftp() As Byte = System.IO.File.ReadAllBytes("C:\test.txt")
Dim strz As System.IO.Stream = request.GetRequestStream()
strz.Write(fileftp, 0, fileftp.Length)
strz.Close()
strz.Dispose()
End If
End While
Catch ex As Exception
MsgBox(ex.ToString)
End Try
End Sub
Alternatively, you could check your event viewer in windows to see if a .net application exception is being logged. Event Viewer > Windows Logs > Application
I am trying to page through BigQuery data with vb.net. I keep getting the same first page of data with my code. The way I understand, I need to set the pagetoken of the response to look at the next page.
With the following code, I only get the first page of data while never exiting my loop.
For the login I was setting the Oauthtoken of my queryrequest and that was getting the first page fine, but no attempt is made to page through the results that way.
I appreciate any lead in the right direction.
Dim DT As New DataTable
Dim ErrMessage As String = ""
Try
Dim INIT As New BigqueryService.Initializer
Dim scopes As IList(Of String) = New List(Of String)()
scopes.Add(BigqueryService.Scope.Bigquery)
Dim credential As UserCredential
Using stream As New FileStream("client_secrets.json", FileMode.Open, FileAccess.Read)
credential = GoogleWebAuthorizationBroker.AuthorizeAsync(GoogleClientSecrets.Load(stream).Secrets, scopes, "user", CancellationToken.None, New FileDataStore("BQ.App")).Result
End Using
INIT.HttpClientInitializer = credential
Dim service As New BigqueryService(INIT)
Dim j As JobsResource = service.Jobs
Dim req As New QueryRequest
req.Query = tQuery.Text
Dim QRequest As JobsResource.QueryRequest = j.Query(req, projectId)
QRequest.OauthToken = MyAccessToken
Dim JOBID As String = QRequest.Execute.JobReference.JobId
Dim DATA = QRequest.Execute
Dim schema = DATA.Schema
For Each col In schema.Fields
DT.Columns.Add(col.Name)
Next
Dim page_Tok = ""
Dim rr As GetQueryResultsResponse
While True
Try
rr.PageToken = page_Tok
Catch ex As Exception
'No Token Yet
End Try
rr = j.GetQueryResults(projectId, JOBID).Execute
page_Tok = rr.PageToken
If rr.JobComplete = True Then
If page_Tok = "" Then
Exit While
End If
End If
Dim resp2 = rr.Rows
For Each row In resp2
Dim DR As DataRow = DT.NewRow
For f = 0 To row.F.Count - 1
Dim field = row.F
DR(f) = row.F(f).V
Next
DT.Rows.Add(DR)
Next
End While
Catch ex As Exception
ErrMessage = ex.Message
End Try
I'm not a VB expert, but you're not setting the page token in the GetQueryResultsRequest, you're setting it in the response. See https://developers.google.com/resources/api-libraries/documentation/bigquery/v2/csharp/latest/classGoogle_1_1Apis_1_1Bigquery_1_1v2_1_1JobsResource_1_1GetQueryResultsRequest.html
I think that this will work:
req = j.GetQueryResults(projectId, JOBID)
req.PageToken = page_tok
rr = req.Execute
I'm using vb.net 2005, I've got the following code running a thread to download a file. However, the process fails sometimes when trying to read the local copy of the file. I think I may need to unlock the local file somehow but I'm not sure how to do this. Can someone take a look and advise me ?
Dim BP1Ended As Boolean = False
Private Sub BackgroundProcess1()
BP1Ended = False
mPadFileStatus = DownloadFile(mstrPadUrl, mLocalFile)
BP1Ended = True
End Sub
'---'
Dim t As System.Threading.Thread
t = New System.Threading.Thread(AddressOf BackgroundProcess1)
t.Start()
Dim ProcessStartTime As Date = Now()
Do While ProcessStartTime.AddMinutes(1) >= Date.Now
Application.DoEvents()
If BP1Ended = True Then
Exit Do
End If
Loop
t.Abort()
PadFileStatus = mPadFileStatus
If BP1Ended = False Then
Application.DoEvents()
AddConsoleMsg("Downloading file.... Aborted", True)
End If
'---'
Public Function DownloadFile(ByVal pstrRequestedFile As String, ByVal pstrDestinationFile As String, Optional ByVal TimeOut As Integer = 120) As DownloadStatus
Dim input As IO.Stream
Dim Req As System.Net.HttpWebRequest = Nothing
Dim Response As System.Net.HttpWebResponse
Try
Req = System.Net.HttpWebRequest.Create(pstrRequestedFile)
Catch ex As Exception
Return DownloadStatus.UnknownError
End Try
Req.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)"
Req.Timeout = TimeOut * 1000 '120 * 1000 '1 second = 1 000 milliseconds
Try
Response = Req.GetResponse
input = Response.GetResponseStream
Dim streamreader As New StreamReader(input, System.Text.Encoding.GetEncoding("windows-1252")) 'System.Text.Encoding.UTF8)'
Dim s_response As String = streamreader.ReadToEnd()
streamreader.Close()
Dim filestream As New FileStream(pstrDestinationFile, FileMode.Create)
Dim streamwriter As New StreamWriter(filestream, System.Text.Encoding.GetEncoding("windows-1252")) ' System.Text.Encoding.UTF8)'
streamwriter.Write(s_response)
streamwriter.Flush()
streamwriter.Close()
Dim length As Long = 1000000 * 100
Dim pos As Long = 0
If Response.ContentLength > 0 Then
length = Response.ContentLength
End If
If length > 0 Then
Return DownloadStatus.OK
End If
input.Close()
Catch ew As System.Net.WebException
If ew.Status = WebExceptionStatus.NameResolutionFailure Or ew.Status = WebExceptionStatus.ProtocolError Then
Return DownloadStatus.FileNotFound
ElseIf ew.Status <> WebExceptionStatus.Success Then
Return DownloadStatus.UnknownError
End If
'Dim errorRespone As HttpWebResponse = CType(ew.Response, HttpWebResponse)
'If errorRespone.StatusCode = HttpStatusCode.NotFound Then '404
' Return DownloadStatus.FileNotFound
'Else
' Return DownloadStatus.UnknownError
'End If'
Catch ex As Exception 'Don't know'
Return DownloadStatus.UnknownError
End Try
End Function
'---'
Dim OpenFile As FileStream
'OpenFile = New FileStream(pstrPadFile, FileMode.Open, FileAccess.ReadWrite, FileShare.ReadWrite)
'FAILS HERE
OpenFile = New FileStream(pstrPadFile, FileMode.Open, FileAccess.Read, FileShare.Read)
You don't close the file stream in case of exception which might lead to the lock. Make sure you always dispose disposable resources by wrapping them in Using statements:
Using filestream As FileStream = New FileStream(pstrDestinationFile, FileMode.Create)
Using streamwriter As StreamWriter = New StreamWriter(filestream, Encoding.GetEncoding("windows-1252"))
streamwriter.Write(s_response)
End Using
End Using
Same stands true for the response and response streams. They should be properly disposed.
Also you may consider using the WebClient class which has methods like DownloadFile and DownloadData which could make your life much easier.