XHTML网站扫描指导

我对VBA和HTML / XHTML非常陌生,但通过在线研究和其他精彩成员的帮助,我设法编写了一个代码来提取我想要的数据。 在XHTML中,我很难识别我想要的元素的ID,所以我认为这是我最糟糕的地方。

网站: http : //www.usbanklocations.com/banks.php?q=&ct=&ml=30&lc=

这里是我想要的代码做:拉银行名称,地址,电话号码,总存款和总资产 – 给我的Excel表中提供的银行名称和城市。

这是我的代码:

Private Declare Sub Sleep Lib "kernel32.dll" (ByVal dwMilliseconds As Long) Sub CommunityBanks() Dim IE As Object, TableResults As Object, webRow As Object, BankName As Variant, page As Long, pageTotal As Long, r As Long Dim beginTime As Date, i As Long, myvalue As Variant Set IE = CreateObject("internetexplorer.application") IE.navigate "http://www.usbanklocations.com/banks.php?name=" & Range("A2").Value & "+Bank&ml=30&lc=" & Range("B2").Value & "%2C+TX" IE.Visible = True Do While IE.Busy Or IE.readystate <> 4 '4 = READYSTATE_COMPLETE DoEvents Loop 'input bank name into form 'myvalue = InputBox("Enter City. Press okay to begin search", "Bank Search") 'Range("F3").Value = myvalue 'IE.document.getelementbyid("MainContent_txtCity").Value = "LegacyTexas" 'click find button 'IE.document.getelementbyid("MainContent_btn").Click 'Sleep 5 * 1000 IE.document.getelementbytagname("table").getelementsbyclassname("btn").Click Sleep 5 * 1000 'total pages pageTotal = IE.document.getelementbyid("lsortby").innertext page = 0 Do Until page = pageTotal DoEvents page = IE.document.getelementbyclassname("lsortby").innertext With IE.document.getelementbyid("main") For r = 1 To .Rows.Length - 1 If Not IsArray(BankName) Then ReDim BankName(7, 0) As Variant Else ReDim Preserve BankName(7, UBound(BankName, 2) + 1) As Variant End If BankName(0, UBound(BankName, 2)) = .Rows(r).Cells(0).innertext Next r End With If page < pageTotal Then IE.document.getelementbyclassname("panelpn").Click beginTime = Now Application.Wait (Now + TimeValue("00:00:05")) End If Loop For r = 0 To UBound(BankName, 2) IE.navigate "http://www.usbanklocations.com/" & BankName(0, r) Do While IE.Busy Or IE.readystate <> 4 '4 = READYSTATE_COMPLETE DoEvents Loop 'wait 5 sec. for screen refresh Sleep 5 * 1000 With IE.document.getelementbytagname("table") For i = 0 To .Rows.Length - 1 DoEvents Select Case .Rows(i).Cells(0).innertext Case "Name:" BankName(1, r) = .Rows(i).Cells(1).innertext Case "Location:" BankName(2, r) = .Rows(i).Cells(1).innertext Case "Phone:" BankName(3, r) = .Rows(i).Cells(1).innertext Case "Branch Deposit:" BankName(4, r) = Replace(Replace(.Rows(i).Cells(1).innertext, ",", ""), "$", "") Case "Total Assets:" BankName(5, r) = Replace(Replace(.Rows(i).Cells(1).innertext, ",", ""), "$", "") End Select Next i End With Next r IE.Quit Set IE = Nothing 'post result on Excel cell Worksheets(1).Range("A9").Resize(UBound(BankName, 2) + 1, UBound(BankName, 1) + 1).Value = Application.Transpose(BankName) End Sub 

先谢谢你! 我将不胜感激任何帮助。

考虑下面的例子,它使用XHR而不是IE和基于分割的HTML内容分析:

 Option Explicit Sub Test_usbanklocations() Dim oSource, oDestination, y, oSrcRow, sName, sCity, sDist, sUrl0, sUrl1, sUrl2, lPage, sResp1, sResp2, i, a1, a2, a3, a4, a5 Set oSource = Sheets(1) Set oDestination = Sheets(2) oDestination.Cells.Delete DataOutput oDestination, 1, Array("Name", "Location", "Phone", "Total Assets", "Total Deposits") y = 2 For Each oSrcRow In oSource.UsedRange.Rows sName = oSrcRow.Cells(1, 1).Value sCity = oSrcRow.Cells(1, 2).Value sDist = oSrcRow.Cells(1, 3).Value sUrl0 = "http://www.usbanklocations.com/banks.php?q=" & EncodeUriComponent(sName) & "&lc=" & EncodeUriComponent(sCity) & "&ml=" & sDist sUrl1 = sUrl0 lPage = 1 Do sResp1 = GetXHR(sUrl1) If InStr(sResp1, "We can not find the address you provided. Please check.") > 0 Then Exit Do a1 = Split(sResp1, "<div class=""pl") For i = 1 To UBound(a1) a2 = Split(a1(i), "</div>", 3) a3 = Split(a2(1), "<a href=""", 2) a4 = Split(a3(1), """>", 2) sUrl2 = "http://www.usbanklocations.com" & a4(0) sResp2 = GetXHR(sUrl2) a5 = Array( _ GetFragment(sResp2, "<b>Name:</b></td><td>", "</td>"), _ Replace(GetFragment(sResp2, "<b>Location:</b></td><td>", "</td>"), "View Other Branches", ""), _ GetFragment(sResp2, "<b>Phone:</b></td>", "</td>"), _ GetFragment(sResp2, "<b>Total Assets:</b></td><td>", "</td>"), _ GetFragment(sResp2, "<b>Total Deposits:</b></td><td>", "</td>") _ ) DataOutput oDestination, y, a5 y = y + 1 DoEvents Next If InStr(sResp1, "Next Page &gt;") = 0 Then Exit Do lPage = lPage + 1 sUrl1 = sUrl0 & "&ps=" & lPage DoEvents Loop Next MsgBox "Completed" End Sub Function GetXHR(sUrl) With CreateObject("MSXML2.XMLHTTP") .Open "GET", sUrl, False .Send GetXHR = .ResponseText End With End Function Sub DataOutput(oSht, y, aValues) With oSht.Cells(y, 1).Resize(1, UBound(aValues) + 1) .NumberFormat = "@" .Value = aValues End With End Sub Function GetFragment(sText, sPatt1, sPatt2) Dim a1, a2 a1 = Split(sText, sPatt1, 2) If UBound(a1) <> 1 Then Exit Function a2 = Split(a1(1), sPatt2, 2) If UBound(a2) <> 1 Then Exit Function GetFragment = GetInnerText(a2(0)) End Function Function EncodeUriComponent(sText) Static objHtmlfile As Object If objHtmlfile Is Nothing Then Set objHtmlfile = CreateObject("htmlfile") objHtmlfile.parentWindow.execScript "function encode(s) {return encodeURIComponent(s)}", "jscript" End If EncodeUriComponent = objHtmlfile.parentWindow.encode(sText) End Function Function GetInnerText(sText) With CreateObject("htmlfile") .Write ("<body>" & sText & "</body>") GetInnerText = .DocumentElement.Document.GetElementsByTagName("body")(0).InnerText End With End Function 

例如,第一个工作表包含要search的数据(银行名称,位置和距离以进行优化):

资源

然后在第二个工作表上的结果如下:

结果