有你在真好 的个人博客
零久网站采集器:模拟人工操作,可以抓取任何网站,您值得拥有
阅读:2237 添加日期:2021/3/27 23:23:37 原文链接:https://www.toutiao.com/item/6470066417122148877/

零久网站采集器:模拟人工操作,可以抓取任何网站,您值得拥有

目前市面上的网页采集器,要么收费,要么有病毒,要么就根本无法抓取复杂的网站,现在我把我公司的采集器源码分享给大家!

编写语言:Microsoft Visual Basic 6.0 中文版

允许环境:win7

采集原理:模拟人工,调用webbrowser 的api进行采集

好处:可以采集任何网站!

基本操作:

1、开启调试

Private Sub Command1_Click()

On Error Resume Next

修改成

Private Sub Command1_Click()

'On Error Resume Next

2、图片保存路径

主程序下的pic文件夹

3、简繁体转换

StoT 简转繁

TtoS 繁转简

4、入口url文件url.txt

5、需要修改的文件:add.html

6、采集原理

WB加载页面,开始采集 wb2 提交到服务器上面

7、数据处理

Function StringWithoutBrackets(ByVal s As String) As String

Dim mc As MatchCollection, ma As Match, sV, sA, S_TR As String, nS_TR As String

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<style[^>]*?>[\s\S]*?<\/style>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(s, "")

End With

If adel.value = 1 Then

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<a[^>]*?>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "<")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<\/a>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, ">")

End With

End If

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<\/p>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<p>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<div[^>]*?none[^>]*?>[\s\S]*?<\/div>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<!--[\s\S]*?-->"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<script[^>]*?>[\s\S]*?<\/script>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "[\n|\r]+"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<p>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<pre>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<br>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<\/li>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "[@]")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<strong>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "[#]")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<\/strong>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "[$]")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<h2>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "[#]")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<\/h2>"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "[$]")

End With

If Check2.value = 1 Then '敏感字检测

StringWithoutBrackets = jc_Key(StringWithoutBrackets)

End If

If upimg.value = 0 Then '删除所有图片?

Set mc = RegExpTest("<[^>]+src=""([^""]+)""", StringWithoutBrackets) '通过正则表示式提取图片,务必注意大小写。

If mc.Count > 0 Then

For Each ma In mc

DoEvents '程序优化

'MsgBox (ma.Value)

S_TR = ma.SubMatches(0)

If InStr(1, S_TR, "end_news.png") Or InStr(1, S_TR, "/r?") Then

Else

'StringWithoutBrackets = Replace(StringWithoutBrackets, S_TR, ">[img]" & S_TR & "[/img]<")

End If

Next

End If

Else

Set mc = RegExpTest("<[^>]+src=""([^""]+)""", StringWithoutBrackets) '通过正则表示式提取图片,务必注意大小写。

If mc.Count > 0 Then

For Each ma In mc

DoEvents '程序优化

'MsgBox (ma.Value)

S_TR = ma.SubMatches(0)

If (Left(S_TR, 1) = "/") Then

nS_TR = "http://" + WB.document.domain + S_TR

Else

If (Left(S_TR, 2) <> "ht") Then

nS_TR = Mid(WB.LocationURL, 1, InStrRev(WB.LocationURL, "/")) & S_TR

Else

nS_TR = S_TR

End If

End If

des = GetType(S_TR)

FileName = App.Path & "\ypic\" & Int(Rnd * 530000) & upnum & "." & des

If URLDownloadToFile(0, nS_TR, FileName, 0, 0) <> 0 Then

'Call DownNetFile(downurl, FileName, "")

End If

If Dir(FileName) = "" Then

Else

fileSize = FileLen(FileName)

If fileSize < 100 Then '已经存在了抛弃大文件

Text1.Text = "文件少于2K抛弃:" & FileName

Kill FileName

GoTo sup

End If

dirtxt = LCase(DigestFileToHexStr(FileName))

If Dir(App.Path & "\pic\" & Left(dirtxt, 3), vbDirectory) = "" Then

MkDir App.Path & "\pic\" & Left(dirtxt, 3)

End If

If Dir(App.Path & "\pic\" & dirtxt & "." & des, vbDirectory) = "" Then

Name FileName As App.Path & "\pic\" & dirtxt & "." & des

Else

Text1.Text = "重复:"

Kill FileName

upnum = upnum + 1

Form1.Caption = "重复" & upnum

GoTo sup

End If

titles = wztitle

upimage = "https://i.ysv8.com/upload/" & Replace(dirtxt, "\", "/") & "." & des

upnum = upnum + 1

Form1.Caption = upnum

sup:

StringWithoutBrackets = Replace(StringWithoutBrackets, S_TR, ">[img]" & "https://i.ysv8.com/upload/" & Replace(dirtxt, "\", "/") & "." & des & "[/img]<")

End If

Next

End If

End If

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "[<][^>]*[>]"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "")

End With

StringWithoutBrackets = Replace(StringWithoutBrackets, "[@]", "<br>")

StringWithoutBrackets = Replace(StringWithoutBrackets, "[#]", "<strong>")

StringWithoutBrackets = Replace(StringWithoutBrackets, "[$]", "</strong>")

StringWithoutBrackets = Replace(StringWithoutBrackets, "指导意见", vbLf + "指导意见")

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "<br>[ ]?"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "<br>")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "(<br>)+"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "<br>")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "(<br>)+"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, "<br>")

End With

With CreateObject("VBSCRIPT.REGEXP")

.Global = True

.Pattern = "(http\:\/\/[a-zA-z0-9.\/\-]*)+"

.IgnoreCase = True

StringWithoutBrackets = .Replace(StringWithoutBrackets, vbLf)

End With

StringWithoutBrackets = jc_th(StringWithoutBrackets)

'StringWithoutBrackets = jc_jq(StringWithoutBrackets)

groupid = 1

'MsgBox (StringWithoutBrackets)

End Function

注意:您最好是具备一定的Visual Basic编程基础,如果您不具备可以去论坛联系我,帮您配置!

(我不一定有时候帮您,您最好是自学下)

源码下载及论坛:
https://www.ysv8.com/f/129.html

零久网站采集器:模拟人工操作,可以抓取任何网站,您值得拥有

源代码免费下载!请点击【采集器免费下载】那几个红字

ICP备案号:苏ICP备14035786号-1 苏公网安备 32050502001014号