{"_id":"cheerio-soupselect","_rev":"60465","name":"cheerio-soupselect","description":"Adds CSS selector support to htmlparser for scraping activities - port of soupselect (python)","dist-tags":{"latest":"0.1.1"},"maintainers":[{"name":"mattmueller","email":"mattmuelle@gmail.com"}],"time":{"modified":"2021-06-03T10:20:27.000Z","created":"2011-11-26T04:30:48.954Z","0.1.1":"2012-03-04T05:28:22.708Z","0.1.0":"2012-01-17T03:51:32.175Z","0.0.3":"2011-12-19T09:43:36.718Z","0.0.2":"2011-11-26T04:51:55.232Z","0.0.1":"2011-11-26T04:30:48.954Z"},"users":{"mast4461":true},"author":{"name":"Matt Mueller","email":"mattmuelle@gmail.com"},"repository":[{"type":"git","url":"git://github.com/harryf/node-soupselect.git"}],"versions":{"0.1.1":{"name":"cheerio-soupselect","version":"0.1.1","engines":{"node":">=0.2.0"},"author":{"name":"Matt Mueller","email":"mattmuelle@gmail.com"},"url":"http://github.com/harryf/node-soupselect","dependencies":{"htmlparser2":"2.x"},"contributors":[{"name":"Siddharth Mahendraker","email":"siddharth_mahen@me.com"}],"devDependencies":{"mocha":"0.x"},"repository":[{"type":"git","url":"git://github.com/harryf/node-soupselect.git"}],"main":"./lib/soupselect","scripts":{"test":"mocha -u tdd -R list"},"license":"MIT","description":"Adds CSS selector support to htmlparser for scraping activities - port of soupselect (python)","_npmUser":{"name":"mattmueller","email":"mattmuelle@gmail.com"},"_id":"cheerio-soupselect@0.1.1","_engineSupported":true,"_npmVersion":"1.0.103","_nodeVersion":"v0.4.11","_defaultsLoaded":true,"dist":{"shasum":"9baa6ab69d3b8cd223167690365a336cb9ff2359","size":10240,"noattachment":false,"key":"/cheerio-soupselect/-/cheerio-soupselect-0.1.1.tgz","tarball":"http://registry.cnpm.dingdandao.com/cheerio-soupselect/download/cheerio-soupselect-0.1.1.tgz"},"maintainers":[{"name":"mattmueller","email":"mattmuelle@gmail.com"}],"directories":{},"publish_time":1330838902708,"_cnpm_publish_time":1330838902708,"_hasShrinkwrap":false},"0.1.0":{"name":"cheerio-soupselect","version":"0.1.0","engines":{"node":">=0.2.0"},"author":{"name":"Matt Mueller","email":"mattmuelle@gmail.com"},"url":"http://github.com/harryf/node-soupselect","dependencies":{"htmlparser2":"2.x"},"contributors":[{"name":"Siddharth Mahendraker","email":"siddharth_mahen@me.com"}],"devDependencies":{"mocha":"0.x"},"repository":[{"type":"git","url":"git://github.com/harryf/node-soupselect.git"}],"main":"./lib/soupselect","scripts":{"test":"mocha -u tdd -R list"},"license":"MIT","description":"Adds CSS selector support to htmlparser for scraping activities - port of soupselect (python)","_npmUser":{"name":"mattmueller","email":"mattmuelle@gmail.com"},"_id":"cheerio-soupselect@0.1.0","_engineSupported":true,"_npmVersion":"1.0.103","_nodeVersion":"v0.4.11","_defaultsLoaded":true,"dist":{"shasum":"22a973dab4b89ff3b98592b6caf668e921fb1660","size":10240,"noattachment":false,"key":"/cheerio-soupselect/-/cheerio-soupselect-0.1.0.tgz","tarball":"http://registry.cnpm.dingdandao.com/cheerio-soupselect/download/cheerio-soupselect-0.1.0.tgz"},"maintainers":[{"name":"mattmueller","email":"mattmuelle@gmail.com"}],"directories":{},"publish_time":1326772292175,"_cnpm_publish_time":1326772292175,"_hasShrinkwrap":false},"0.0.3":{"name":"cheerio-soupselect","version":"0.0.3","engines":{"node":">=0.2.0"},"author":{"name":"Matt Mueller","email":"mattmuelle@gmail.com"},"url":"http://github.com/harryf/node-soupselect","dependencies":{"htmlparser2":"2.x"},"repository":[{"type":"git","url":"git://github.com/harryf/node-soupselect.git"}],"main":"./lib/soupselect","license":"MIT","description":"Adds CSS selector support to htmlparser for scraping activities - port of soupselect (python)","_npmUser":{"name":"mattmueller","email":"mattmuelle@gmail.com"},"_id":"cheerio-soupselect@0.0.3","devDependencies":{},"_engineSupported":true,"_npmVersion":"1.0.103","_nodeVersion":"v0.4.11","_defaultsLoaded":true,"dist":{"shasum":"e498a5d9abc2b77f7d64e586f0849402d2398381","size":10240,"noattachment":false,"key":"/cheerio-soupselect/-/cheerio-soupselect-0.0.3.tgz","tarball":"http://registry.cnpm.dingdandao.com/cheerio-soupselect/download/cheerio-soupselect-0.0.3.tgz"},"maintainers":[{"name":"mattmueller","email":"mattmuelle@gmail.com"}],"directories":{},"publish_time":1324287816718,"_cnpm_publish_time":1324287816718,"_hasShrinkwrap":false},"0.0.2":{"name":"cheerio-soupselect","version":"0.0.2","engines":{"node":">=0.2.0"},"author":{"name":"Matt Mueller","email":"mattmuelle@gmail.com"},"url":"http://github.com/harryf/node-soupselect","dependencies":{"htmlparser2":"1.5.x"},"repository":[{"type":"git","url":"git://github.com/harryf/node-soupselect.git"}],"main":"./lib/soupselect","license":"MIT","description":"Adds CSS selector support to htmlparser for scraping activities - port of soupselect (python)","_npmUser":{"name":"mattmueller","email":"mattmuelle@gmail.com"},"_id":"cheerio-soupselect@0.0.2","devDependencies":{},"_engineSupported":true,"_npmVersion":"1.0.104","_nodeVersion":"v0.4.11","_defaultsLoaded":true,"dist":{"shasum":"ed81023842bd0109e616c0d044d10c5dc7e1e3ec","size":10240,"noattachment":false,"key":"/cheerio-soupselect/-/cheerio-soupselect-0.0.2.tgz","tarball":"http://registry.cnpm.dingdandao.com/cheerio-soupselect/download/cheerio-soupselect-0.0.2.tgz"},"maintainers":[{"name":"mattmueller","email":"mattmuelle@gmail.com"}],"directories":{},"publish_time":1322283115232,"_cnpm_publish_time":1322283115232,"_hasShrinkwrap":false},"0.0.1":{"name":"cheerio-soupselect","version":"0.0.1","engines":{"node":">=0.2.0"},"author":{"name":"Matt Mueller","email":"mattmuelle@gmail.com"},"url":"http://github.com/harryf/node-soupselect","dependencies":{"htmlparser2":"1.5.x"},"repository":[{"type":"git","url":"git://github.com/harryf/node-soupselect.git"}],"main":"./lib/soupselect","license":"MIT","description":"Adds CSS selector support to htmlparser for scraping activities - port of soupselect (python)","_npmUser":{"name":"mattmueller","email":"mattmuelle@gmail.com"},"_id":"cheerio-soupselect@0.0.1","contributors":[{"name":"Simon Willison https://github.com/simonw"},{"name":"Harry Fuecks https://github.com/harryf"},{"name":"Chris O'Hara https://github.com/chriso"}],"devDependencies":{},"_engineSupported":true,"_npmVersion":"1.0.104","_nodeVersion":"v0.4.11","_defaultsLoaded":true,"dist":{"shasum":"ecdd4eae8ee867d6bc9dfb384c3b207f3ee255ad","size":40960,"noattachment":false,"key":"/cheerio-soupselect/-/cheerio-soupselect-0.0.1.tgz","tarball":"http://registry.cnpm.dingdandao.com/cheerio-soupselect/download/cheerio-soupselect-0.0.1.tgz"},"maintainers":[{"name":"mattmueller","email":"mattmuelle@gmail.com"}],"directories":{},"publish_time":1322281848954,"_cnpm_publish_time":1322281848954,"_hasShrinkwrap":false}},"readme":"node-soupselect\n---------------\n\nA port of Simon Willison's [soupselect](http://code.google.com/p/soupselect/) for use with node.js and node-htmlparser.\n\n    $ npm install soupselect\n\nMinimal example...\n\n    var select = require('soupselect').select;\n    // dom provided by htmlparser...\n    select(dom, \"#main a.article\").forEach(function(element) {//...});\n\nWanted a friendly way to scrape HTML using node.js. Tried using [jsdom](http://github.com/tmpvar/jsdom), prompted by [this article](http://blog.nodejitsu.com/jsdom-jquery-in-5-lines-on-nodejs) but, unfortunately, [jsdom](http://github.com/tmpvar/jsdom) takes a strict view of lax HTML making it unusable for scraping the kind of soup found in real world web pages. Luckily [htmlparser](http://github.com/tautologistics/node-htmlparser/) is more forgiving. More details on this found [here](http://www.reddit.com/r/node/comments/dm0tz/nodesoupselect_for_scraping_html_with_css/c118r23).\n\nA complete example including fetching HTML etc...;\n\n    var select = require('soupselect').select,\n        htmlparser = require(\"htmlparser\"),\n        http = require('http'),\n        sys = require('sys');\n\n    // fetch some HTML...\n    var http = require('http');\n    var host = 'www.reddit.com';\n    var client = http.createClient(80, host);\n    var request = client.request('GET', '/',{'host': host});\n\n    request.on('response', function (response) {\n        response.setEncoding('utf8');\n    \n        var body = \"\";\n        response.on('data', function (chunk) {\n            body = body + chunk;\n        });\n    \n        response.on('end', function() {\n        \n            // now we have the whole body, parse it and select the nodes we want...\n            var handler = new htmlparser.DefaultHandler(function(err, dom) {\n                if (err) {\n                    sys.debug(\"Error: \" + err);\n                } else {\n                \n                    // soupselect happening here...\n                    var titles = select(dom, 'a.title');\n                \n                    sys.puts(\"Top stories from reddit\");\n                    titles.forEach(function(title) {\n                        sys.puts(\"- \" + title.children[0].raw + \" [\" + title.attribs.href + \"]\\n\");\n                    })\n                }\n            });\n\n            var parser = new htmlparser.Parser(handler);\n            parser.parseComplete(body);\n        });\n    });\n    request.end();\n\nNotes:\n\n* Requires node-htmlparser > 1.6.2 & node.js 2+\n* Calls to select are synchronous - not worth trying to make it asynchronous IMO given the use case\n\n","_attachments":{},"license":"MIT"}