-
Notifications
You must be signed in to change notification settings - Fork 1
/
crawler.coffee
92 lines (77 loc) · 2.48 KB
/
crawler.coffee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
_ = require 'lodash'
_log = require('single-line-log').stdout
fs = require 'fs'
chalk = require 'chalk'
Crawler = require "simplecrawler"
moment = require 'moment'
log_path = "test/site-crawler/logs/"
config = require('./lib/parseConfig')()
mkdirp = require 'mkdirp'
#--------------------------------------------------------
# Site Crawler
#--------------------------------------------------------
SiteCrawler =
  # Declared on the object but never assigned or read by any method here.
  remaining: null

  # Entry point: resets the error bookkeeping, starts the elapsed-time
  # counter and then starts the crawl.
  init: ->
    @errorsObj = {}   # url -> { refs: [referrer, ...] }
    @errors = 0       # total number of 404 responses seen
    @startTimer()
    @startCrawler()

  # Starts a once-per-second counter stored in @duration (seconds).
  # The counter starts at 1, not 0. The interval handle is kept in @timer
  # so that `finish` can clear it.
  startTimer: ->
    @duration = 1
    tick = => @duration++
    @timer = setInterval tick, 1000

  # Creates the crawler for `config.url`, disables parsing of script tags
  # and HTML comments, and wires the crawler events to this object's
  # handlers. Returns the crawler (result of the `.on` chain).
  #
  # NOTE: "fetchredirect" is deliberately not subscribed; no redirect
  # handler is defined on this object.
  startCrawler: ->
    @crawler = Crawler.crawl config.url
    @crawler.parseScriptTags = false
    @crawler.parseHTMLComments = false
    @crawler
      .on("fetch404", @logError.bind(@))
      .on("fetchcomplete", @progress.bind(@))
      .on("complete", @finish.bind(@))

  # Creates an empty, timestamped log file under `log_path` and records its
  # path in @current_logfile (set synchronously, before any I/O happens).
  #
  # cb - optional function invoked with no arguments once the file exists.
  #
  # The log directory is created first if it is missing, mirroring what
  # `finish` does for its report directory. I/O errors are thrown from the
  # callback.
  createLogFile: (cb) ->
    stamp = moment().format 'MMM-D-YYYY-h-mm-ss-a'
    @current_logfile = "#{log_path}#{stamp}_log.txt"
    mkdirp log_path, (dirErr) =>
      throw dirErr if dirErr?
      fs.writeFile @current_logfile, "", (writeErr) ->
        throw writeErr if writeErr?
        cb?()

  # "fetch404" handler: counts the error and records which page referred
  # to the missing URL.
  #
  # item     - queue item; only `item.url` and `item.referrer` are read.
  # response - accepted for the event signature but not used.
  logError: (item, response) ->
    @errors++
    @addError
      url: item.url
      ref: item.referrer

  # Appends `params.ref` to the list of referrers recorded for `params.url`,
  # creating the entry on first sight of the URL.
  #
  # The referrer is always stored as a string so every element of `refs`
  # has the same type in the JSON report.
  addError: (params) ->
    ref = "#{params.ref}"
    entry = (@errorsObj[params.url] ?= { refs: [] })
    entry.refs.push ref

  # Returns whatever `@crawler.queue.complete()` returns. Despite the name,
  # `progress` uses this value as the numerator of the completion ratio.
  # NOTE(review): presumably the count of completed queue items -- confirm
  # against the simplecrawler version in use.
  getRemaining: -> @crawler.queue.complete()

  # Returns the crawler queue's `length`; used by `progress` as the
  # denominator of the completion ratio.
  getTotal: -> @crawler.queue.length

  # "fetchcomplete" handler: redraws a single-line, 30-character progress
  # bar with the completion percentage.
  #
  # finished - unused; kept so the signature stays unchanged.
  progress: (finished = false) ->
    bar_width = 30
    fraction = @getRemaining() / @getTotal()
    # Covers an empty queue (0/0) and non-numeric counts, which would
    # otherwise render as "NaN%".
    fraction = 0 unless isFinite fraction
    filled = Math.floor fraction * bar_width
    filled = Math.max 0, Math.min(bar_width, filled)
    # `new Array(n + 1).join(ch)` yields `ch` repeated n times.
    bar = new Array(filled + 1).join("=") + new Array(bar_width - filled + 1).join(" ")
    _log chalk.cyan "Crawling #{config.url} [#{bar}] #{(fraction * 100).toFixed(2)}%"

  # "complete" handler: clears the progress line, stops the timer and
  # prints the elapsed time in minutes. If any 404s were recorded, prints
  # the total and writes @errorsObj as JSON to
  # `<__dirname>/log/<config.name>.json`, creating the directory if needed.
  # I/O errors are thrown from the callbacks.
  finish: ->
    _log.clear()
    clearInterval @timer
    duration = "#{(@duration / 60).toFixed(2)} minutes"
    console.log chalk.white.bgCyan "\n Site Crawler Completed in #{duration}"
    if @errors > 0
      console.log chalk.white.bgMagenta "\n #{@errors} Total Errors"
      mkdirp "#{__dirname}/log", (dirErr) =>
        throw dirErr if dirErr?
        report = JSON.stringify @errorsObj
        fs.writeFile "#{__dirname}/log/#{config.name}.json", report, (writeErr) ->
          throw writeErr if writeErr?
module.exports = SiteCrawler