0

Not being familiar with JS or CoffeeScript, I intended to use casperjs to download pages and Python to parse them. But I found that the pages I downloaded didn't look like the ones I saw in the browser - actually, some parts of them hadn't been loaded before the pages were saved. I guess the reason may be that the onload callbacks hadn't been executed yet. What should I do if I want to download pages that are the same as the ones I see in the browser? Thanks very much!

My code (coffeescript):

# Map of short site name -> URL of the page to fetch.
urls =
  'jd' : 'http://list.jd.com/652-654-831-0-0-0-0-0-0-0-1-1-1-1-1-72-4137-33.html'

casper = require("casper").create()

# Registers the CasperJS steps for every entry in `urls`: a start step that
# logs a message, then one thenOpen step per URL whose callback calls
# download() with the URL and the target file name "<name>.html" and echoes
# the result.
process = (urls) ->
  casper.start "", ->
    @echo "begin to work"
  for name, url of urls
    # NOTE(review): `name` and `url` are loop variables shared by every
    # callback registered here; the callbacks presumably run only once
    # casper.run() executes the queued steps, so with more than one entry
    # they would all see the last pair - confirm before adding more URLs.
    # NOTE(review): download() appears to fetch the resource at `url` itself
    # rather than saving the page as currently rendered - verify against the
    # CasperJS documentation.
    casper.thenOpen url, ->
      @echo @download url, "#{name}.html"

process(urls)

casper.run()
Gao Hao
  • 244
  • 2
  • 12
  • Maybe you can skip the intermediate step and just screen scrape with Python: http://stackoverflow.com/questions/5272338/is-there-a-python-library-that-allows-you-to-screen-scrape-a-web-site-that-relie – jcollum Nov 07 '13 at 16:57
  • 1
    Thank you. It seems that I've found the reason: neither the browser nor casperjs knows when the page is completely loaded. The download() method of casperjs saves the page as soon as it has downloaded all the HTML text and doesn't care about the execution of JS. – Gao Hao Nov 08 '13 at 06:10

1 Answers1

2

As you have seen, casper.download() actually downloads the file at the given URL. Since you want the current page source, you can use casper.getHTML(). To actually write the page contents string to a file, you can use the file system module that PhantomJS provides. It has an fs.write() function.

Putting it all together, it would look like this in JavaScript:

// Saves the current HTML of every page listed in `urls` to "<name>.html".
// Assumes `casper` (a CasperJS instance) and `urls` (a name -> URL map) are
// already defined, as in the question's script.
var fs = require("fs");
casper.start();
// Object.keys().forEach() gives each step callback its own `name` binding.
// A plain for-in loop shares one variable, and because the step callbacks only
// run after casper.run(), every step would see the last key.
Object.keys(urls).forEach(function (name) {
    // Open the URL stored under the key, not the key itself.
    casper.thenOpen(urls[name], function () {
        this.echo("download " + name);
        // getHTML() returns the markup of the page as it currently is in the
        // browser, which is then written out with PhantomJS's fs module.
        fs.write(name + ".html", this.getHTML(), "w");
    });
});
casper.run();

or like this in CoffeeScript:

# Saves the current HTML of every page listed in `urls` to "<name>.html".
# Assumes `urls` (a name -> URL map) is defined as in the question's script.
casper = require("casper").create()
fs = require("fs")

casper.start "", ->
    @echo "begin to work"
for name, url of urls
    # `do` creates a fresh scope per iteration. Without it every step callback
    # shares the same `name`, and since the callbacks only run after
    # casper.run(), each would write to the file named after the last key.
    do (name, url) ->
        casper.thenOpen url, ->
            @echo "download " + name
            # getHTML() returns the markup of the currently loaded page;
            # fs.write comes from PhantomJS's file system module.
            fs.write "#{name}.html", @getHTML(), "w"

casper.run()
Artjom B.
  • 59,901
  • 24
  • 121
  • 211