如何根据string值过滤URL匹配

目前我使用下面的代码提取了13,000个URL。 但是,其中有3000人提供了来自Facebook,彭博等的url。 对于这些url,我一直在手动search那里的名字,可能有20个人中有一个公司的url。 所以我的问题是这样的:有没有一种方法,我可以编辑macros,以便如果一个URL页面包含一个string值,如“脸谱”或“维基”,它会跳过该url,并继续search一个URL不包含string值?

我如何提取url的代码:

Sub XMLHTTP() Dim url As String, lastRow As Long Dim XMLHTTP As Object, html As Object, objResultDiv As Object, objH3 As Object, link As Object Dim start_time As Date Dim end_time As Date lastRow = Range("A" & Rows.Count).End(xlUp).Row Dim cookie As String Dim result_cookie As String start_time = Time Debug.Print "start_time:" & start_time For i = 2 To lastRow url = "https://www.google.co.in/search?q=" & Cells(i, 1) & "&rnd=" & WorksheetFunction.RandBetween(1, 10000) Set XMLHTTP = CreateObject("MSXML2.serverXMLHTTP") XMLHTTP.Open "GET", url, False XMLHTTP.setRequestHeader "Content-Type", "text/xml" XMLHTTP.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0" XMLHTTP.send Set html = CreateObject("htmlfile") html.body.innerHTML = XMLHTTP.ResponseText Set objResultDiv = html.getelementbyid("rso") Set objH3 = objResultDiv.getelementsbytagname("H3")(0) Set link = objH3.getelementsbytagname("a")(0) str_text = Replace(link.innerHTML, "<EM>", "") str_text = Replace(str_text, "</EM>", "") Cells(i, 2) = str_text Cells(i, 3) = link.href DoEvents Next end_time = Time Debug.Print "end_time:" & end_time Debug.Print "done" & "Time taken : " & DateDiff("n", start_time, end_time) MsgBox "done" & "Time taken : " & DateDiff("n", start_time, end_time) End Sub 

这是我用来根据string值过滤URL的代码:

 Sub badURLs() Dim lr As Long ' Declare the variable lr = Cells(Rows.Count, 3).End(xlUp).Row ' Set the variable ' lr now contains the last used row in column A Application.ScreenUpdating = False For a = lr To 1 Step -1 If InStr(1, Cells(a, 3), "bloomberg", vbTextCompare) > 0 _ Or InStr(1, Cells(a, 3), "manta", vbTextCompare) > 0 _ Or InStr(1, Cells(a, 3), "yellowpages", vbTextCompare) > 0 _ Or InStr(1, Cells(a, 3), "yelp", vbTextCompare) > 0 _ Or InStr(1, Cells(a, 3), "snapshot", vbTextCompare) > 0 _ Or InStr(1, Cells(a, 3), "facebook", vbTextCompare) > 0 _ Or InStr(1, Cells(a, 3), "wiki", vbTextCompare) > 0 _ Or InStr(1, Cells(a, 3), "linkedin", vbTextCompare) > 0 _ Or InStr(1, Cells(a, 3), "hoovers", vbTextCompare) > 0 Then 'Compares for bloomberg, wiki, or hoovers. Enters loop if value is greater than 0 With Cells(a, 3) .NumberFormat = "General" .Value = "NA" End With End If Next a Application.ScreenUpdating = True End Sub 

只是想重申:我想知道是否有可能(如果有的话)如何根据第二个string中的值过滤掉第一个macros中的URL。 我希望这可以让我有更准确的url命中,我不必手动search3000公司名称,希望只有less数将有一个有用的url。

编辑演示使用:

我在下面完整地复制XMLHTTP()代码,然后在下面添加用户定义的函数来演示模块的布局。 我所做的更改实际上只影响一个: Cells(i, 3) = href 。 在这种情况下,如果href位于URLS的错误列表中,则不会在Cells(i, 3)放置任何内容。 如果您需要更复杂的业务逻辑,请告诉我们,我们将尽力提供帮助。

 Sub XMLHTTP() Dim url As String, lastRow As Long Dim XMLHTTP As Object, html As Object, objResultDiv As Object, objH3 As Object, link As Object Dim start_time As Date Dim end_time As Date lastRow = Range("A" & Rows.Count).End(xlUp).Row Dim cookie As String Dim result_cookie As String start_time = Time Debug.Print "start_time:" & start_time For i = 2 To lastRow url = "https://www.google.co.in/search?q=" & Cells(i, 1) & "&rnd=" & WorksheetFunction.RandBetween(1, 10000) Set XMLHTTP = CreateObject("MSXML2.serverXMLHTTP") XMLHTTP.Open "GET", url, False XMLHTTP.setRequestHeader "Content-Type", "text/xml" XMLHTTP.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0" XMLHTTP.send Set html = CreateObject("htmlfile") html.body.innerHTML = XMLHTTP.ResponseText Set objResultDiv = html.getelementbyid("rso") Set objH3 = objResultDiv.getelementsbytagname("H3")(0) Set link = objH3.getelementsbytagname("a")(0) str_text = Replace(link.innerHTML, "<EM>", "") str_text = Replace(str_text, "</EM>", "") Cells(i, 2) = str_text If funcBadUrls(Cells(i, 1)) then Cells(i, 3) = "" Else Cells(i, 3) = link.href End If DoEvents Next end_time = Time Debug.Print "end_time:" & end_time Debug.Print "done" & "Time taken : " & DateDiff("n", start_time, end_time) MsgBox "done" & "Time taken : " & DateDiff("n", start_time, end_time) End Sub Function funcBadURLs(sInput as String) as Boolean Dim bResult as Boolean If InStr(1, sInput, "bloomberg", vbTextCompare) > 0 _ Or InStr(1, sInput, "manta", vbTextCompare) > 0 _ Or InStr(1, sInput, "yellowpages", vbTextCompare) > 0 _ Or InStr(1, sInput, "yelp", vbTextCompare) > 0 _ Or InStr(1, sInput, "snapshot", vbTextCompare) > 0 _ Or InStr(1, sInput, "facebook", vbTextCompare) > 0 _ Or InStr(1, sInput, "wiki", vbTextCompare) > 0 _ Or InStr(1, sInput, "linkedin", vbTextCompare) > 0 _ Or InStr(1, sInput, "hoovers", vbTextCompare) > 0 Then bResult = True Else bResult = False End If funcBadUrls = bResult End Sub 

如果我理解正确,你想在第一个子程序中忽略BadUrls 。 如果是这样,考虑创build一个基于第二个例程的Function ,如果不好就返回true,否则返回false。 然后你可以根据需要build立逻辑。 例如:

 Function funcBadURLs(sInput as String) as Boolean Dim bResult as Boolean If InStr(1, sInput, "bloomberg", vbTextCompare) > 0 _ Or InStr(1, sInput, "manta", vbTextCompare) > 0 _ Or InStr(1, sInput, "yellowpages", vbTextCompare) > 0 _ Or InStr(1, sInput, "yelp", vbTextCompare) > 0 _ Or InStr(1, sInput, "snapshot", vbTextCompare) > 0 _ Or InStr(1, sInput, "facebook", vbTextCompare) > 0 _ Or InStr(1, sInput, "wiki", vbTextCompare) > 0 _ Or InStr(1, sInput, "linkedin", vbTextCompare) > 0 _ Or InStr(1, sInput, "hoovers", vbTextCompare) > 0 Then bResult = True Else bResult = False End If funcBadUrls = bResult End Sub 

要使用它:

 Sub Test() If funcBadUrls("www.bloomberg.com") then 'Do whatever to skip Else MsgBox "Success" End If End Sub 

让我知道这是否有帮助,或者如果我误解了你的问题。