使用 parse.com 的云代码,我试图从网页上抓取数据以发送到我的 iOS 应用程序。我已经在 iOS 中本地实现了网络抓取代码,但我正在尝试将此任务移至后端。我正在使用一个名为 xpath.js 的 node.js 库
Parse.Cloud.define("test", function(request, response) {
Parse.Cloud.httpRequest({
url: "http://menu.ha.ucla.edu/foodpro/default.asp",
success: function(httpResponse) {
var text = httpResponse.text;
var xpath = require("cloud/xpath.js"), dom = require("cloud/dom-parser.js").DOMParser;
var doc = new dom().parseFromString(text);
var cells = xpath.select("//td[starts-with(@class, 'menugridcell')]", doc);
response.success("test " + cells.count);
var listNode = xpath.select("//ul", cells[0])[0];
},
error: function(httpResponse) {
console.error('Request failed with response code ' + httpResponse.status);
}
});
});
但是,当我运行代码时,我收到此错误:
"Uncaught end tag name: div is not match the current start tagName:script"
正如我之前提到的,我已经能够使用单独的 objective-c 库成功地抓取网络数据,因此标签是一致的,问题不可能出在源代码中。
对于源代码,这里是webpage I'm scraping . StackOverflow 不允许我直接链接到源代码,否则我会给出一个直接链接。
编辑:
这是dom-parser.js中的代码
function DOMParser(options){
this.options = options ||{locator:{}};
}
DOMParser.prototype.parseFromString = function(source,mimeType){
var options = this.options;
var sax = new XMLReader();
var domBuilder = options.domBuilder || new DOMHandler();//contentHandler and LexicalHandler
var errorHandler = options.errorHandler;
var locator = options.locator;
var defaultNSMap = options.xmlns||{};
var entityMap = {'lt':'<','gt':'>','amp':'&','quot':'"','apos':"'"}
if(locator){
domBuilder.setDocumentLocator(locator)
}
sax.errorHandler = buildErrorHandler(errorHandler,domBuilder,locator);
sax.domBuilder = options.domBuilder || domBuilder;
if(/\/x?html?$/.test(mimeType)){
entityMap.nbsp = '\xa0';
entityMap.copy = '\xa9';
defaultNSMap['']= 'http://www.w3.org/1999/xhtml';
}
if(source){
sax.parse(source,defaultNSMap,entityMap);
}else{
sax.errorHandler.error("invalid document source");
}
return domBuilder.document;
}
function buildErrorHandler(errorImpl,domBuilder,locator){
if(!errorImpl){
if(domBuilder instanceof DOMHandler){
return domBuilder;
}
errorImpl = domBuilder ;
}
var errorHandler = {}
var isCallback = errorImpl instanceof Function;
locator = locator||{}
function build(key){
var fn = errorImpl[key];
if(!fn){
if(isCallback){
fn = errorImpl.length == 2?function(msg){errorImpl(key,msg)}:errorImpl;
}else{
var i=arguments.length;
while(--i){
if(fn = errorImpl[arguments[i]]){
break;
}
}
}
}
errorHandler[key] = fn && function(msg){
fn(msg+_locator(locator));
}||function(){};
}
build('warning','warn');
build('error','warn','warning');
build('fatalError','warn','warning','error');
return errorHandler;
}
/**
* +ContentHandler+ErrorHandler
* +LexicalHandler+EntityResolver2
* -DeclHandler-DTDHandler
*
* DefaultHandler:EntityResolver, DTDHandler, ContentHandler, ErrorHandler
* DefaultHandler2:DefaultHandler,LexicalHandler, DeclHandler, EntityResolver2
* @link http://www.saxproject.org/apidoc/org/xml/sax/helpers/DefaultHandler.html
*/
function DOMHandler() {
this.cdata = false;
}
function position(locator,node){
node.lineNumber = locator.lineNumber;
node.columnNumber = locator.columnNumber;
}
/**
* @see org.xml.sax.ContentHandler#startDocument
* @link http://www.saxproject.org/apidoc/org/xml/sax/ContentHandler.html
*/
DOMHandler.prototype = {
startDocument : function() {
this.document = new DOMImplementation().createDocument(null, null, null);
if (this.locator) {
this.document.documentURI = this.locator.systemId;
}
},
startElement:function(namespaceURI, localName, qName, attrs) {
var doc = this.document;
var el = doc.createElementNS(namespaceURI, qName||localName);
var len = attrs.length;
appendElement(this, el);
this.currentElement = el;
this.locator && position(this.locator,el)
for (var i = 0 ; i < len; i++) {
var namespaceURI = attrs.getURI(i);
var value = attrs.getValue(i);
var qName = attrs.getQName(i);
var attr = doc.createAttributeNS(namespaceURI, qName);
if( attr.getOffset){
position(attr.getOffset(1),attr)
}
attr.value = attr.nodeValue = value;
el.setAttributeNode(attr)
}
},
endElement:function(namespaceURI, localName, qName) {
var current = this.currentElement
var tagName = current.tagName;
this.currentElement = current.parentNode;
},
startPrefixMapping:function(prefix, uri) {
},
endPrefixMapping:function(prefix) {
},
processingInstruction:function(target, data) {
var ins = this.document.createProcessingInstruction(target, data);
this.locator && position(this.locator,ins)
appendElement(this, ins);
},
ignorableWhitespace:function(ch, start, length) {
},
characters:function(chars, start, length) {
chars = _toString.apply(this,arguments)
//console.log(chars)
if(this.currentElement && chars){
if (this.cdata) {
var charNode = this.document.createCDATASection(chars);
this.currentElement.appendChild(charNode);
} else {
var charNode = this.document.createTextNode(chars);
this.currentElement.appendChild(charNode);
}
this.locator && position(this.locator,charNode)
}
},
skippedEntity:function(name) {
},
endDocument:function() {
this.document.normalize();
},
setDocumentLocator:function (locator) {
if(this.locator = locator){// && !('lineNumber' in locator)){
locator.lineNumber = 0;
}
},
//LexicalHandler
comment:function(chars, start, length) {
chars = _toString.apply(this,arguments)
var comm = this.document.createComment(chars);
this.locator && position(this.locator,comm)
appendElement(this, comm);
},
startCDATA:function() {
//used in characters() methods
this.cdata = true;
},
endCDATA:function() {
this.cdata = false;
},
startDTD:function(name, publicId, systemId) {
var impl = this.document.implementation;
if (impl && impl.createDocumentType) {
var dt = impl.createDocumentType(name, publicId, systemId);
this.locator && position(this.locator,dt)
appendElement(this, dt);
}
},
/**
* @see org.xml.sax.ErrorHandler
* @link http://www.saxproject.org/apidoc/org/xml/sax/ErrorHandler.html
*/
warning:function(error) {
console.warn(error,_locator(this.locator));
},
error:function(error) {
console.error(error,_locator(this.locator));
},
fatalError:function(error) {
console.error(error,_locator(this.locator));
throw error;
}
}
function _locator(l){
if(l){
return '\n@'+(l.systemId ||'')+'#[line:'+l.lineNumber+',col:'+l.columnNumber+']'
}
}
function _toString(chars,start,length){
if(typeof chars == 'string'){
return chars.substr(start,length)
}else{//java sax connect width xmldom on rhino(what about: "? && !(chars instanceof String)")
if(chars.length >= start+length || start){
return new java.lang.String(chars,start,length)+'';
}
return chars;
}
}
/*
* @link http://www.saxproject.org/apidoc/org/xml/sax/ext/LexicalHandler.html
* used method of org.xml.sax.ext.LexicalHandler:
* #comment(chars, start, length)
* #startCDATA()
* #endCDATA()
* #startDTD(name, publicId, systemId)
*
*
* IGNORED method of org.xml.sax.ext.LexicalHandler:
* #endDTD()
* #startEntity(name)
* #endEntity(name)
*
*
* @link http://www.saxproject.org/apidoc/org/xml/sax/ext/DeclHandler.html
* IGNORED method of org.xml.sax.ext.DeclHandler
* #attributeDecl(eName, aName, type, mode, value)
* #elementDecl(name, model)
* #externalEntityDecl(name, publicId, systemId)
* #internalEntityDecl(name, value)
* @link http://www.saxproject.org/apidoc/org/xml/sax/ext/EntityResolver2.html
* IGNORED method of org.xml.sax.EntityResolver2
* #resolveEntity(String name,String publicId,String baseURI,String systemId)
* #resolveEntity(publicId, systemId)
* #getExternalSubset(name, baseURI)
* @link http://www.saxproject.org/apidoc/org/xml/sax/DTDHandler.html
* IGNORED method of org.xml.sax.DTDHandler
* #notationDecl(name, publicId, systemId) {};
* #unparsedEntityDecl(name, publicId, systemId, notationName) {};
*/
"endDTD,startEntity,endEntity,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,resolveEntity,getExternalSubset,notationDecl,unparsedEntityDecl".replace(/\w+/g,function(key){
DOMHandler.prototype[key] = function(){return null}
})
/* Private static helpers treated below as private instance methods, so don't need to add these to the public API; we might use a Relator to also get rid of non-standard public properties */
function appendElement (hander,node) {
if (!hander.currentElement) {
hander.document.appendChild(node);
} else {
hander.currentElement.appendChild(node);
}
}//appendChild and setAttributeNS are preformance key
if(typeof require == 'function'){
var XMLReader = require('cloud/sax').XMLReader;
var DOMImplementation = exports.DOMImplementation = require('cloud/dom').DOMImplementation;
exports.XMLSerializer = require('cloud/dom').XMLSerializer ;
exports.DOMParser = DOMParser;
}
最佳答案
给定的页面在 html 脚本中包含一些 XML 标记。开始标签可能会被忽略,因为它们包含转义引号。解析器找到 </div> (在脚本中的字符串中)并尝试将其与开头 <script> 匹配并失败了。您的解析器试图读取 XML,但不知道 xhtml 脚本区域是 CData。
您必须告诉解析器忽略(或读取为 CData)脚本标记。抱歉,我不知道该怎么做。
向你致以最诚挚的问候
关于javascript - 第 3 方 XML 解析器 (xpath.js) 给出错误 "Uncaught end tag name: div is not match the current start tagName",我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/26074431/
我正在学习如何使用Nokogiri,根据这段代码我遇到了一些问题:require'rubygems'require'mechanize'post_agent=WWW::Mechanize.newpost_page=post_agent.get('http://www.vbulletin.org/forum/showthread.php?t=230708')puts"\nabsolutepathwithtbodygivesnil"putspost_page.parser.xpath('/html/body/div/div/div/div/div/table/tbody/tr/td/div
我有一个字符串input="maybe(thisis|thatwas)some((nice|ugly)(day|night)|(strange(weather|time)))"Ruby中解析该字符串的最佳方法是什么?我的意思是脚本应该能够像这样构建句子:maybethisissomeuglynightmaybethatwassomenicenightmaybethiswassomestrangetime等等,你明白了......我应该一个字符一个字符地读取字符串并构建一个带有堆栈的状态机来存储括号值以供以后计算,还是有更好的方法?也许为此目的准备了一个开箱即用的库?
我正在尝试测试是否存在表单。我是Rails新手。我的new.html.erb_spec.rb文件的内容是:require'spec_helper'describe"messages/new.html.erb"doit"shouldrendertheform"dorender'/messages/new.html.erb'reponse.shouldhave_form_putting_to(@message)with_submit_buttonendendView本身,new.html.erb,有代码:当我运行rspec时,它失败了:1)messages/new.html.erbshou
我在从html页面生成PDF时遇到问题。我正在使用PDFkit。在安装它的过程中,我注意到我需要wkhtmltopdf。所以我也安装了它。我做了PDFkit的文档所说的一切......现在我在尝试加载PDF时遇到了这个错误。这里是错误:commandfailed:"/usr/local/bin/wkhtmltopdf""--margin-right""0.75in""--page-size""Letter""--margin-top""0.75in""--margin-bottom""0.75in""--encoding""UTF-8""--margin-left""0.75in""-
我主要使用Ruby来执行此操作,但到目前为止我的攻击计划如下:使用gemsrdf、rdf-rdfa和rdf-microdata或mida来解析给定任何URI的数据。我认为最好映射到像schema.org这样的统一模式,例如使用这个yaml文件,它试图描述数据词汇表和opengraph到schema.org之间的转换:#SchemaXtoschema.orgconversion#data-vocabularyDV:name:namestreet-address:streetAddressregion:addressRegionlocality:addressLocalityphoto:i
我正在使用ruby1.9解析以下带有MacRoman字符的csv文件#encoding:ISO-8859-1#csv_parse.csvName,main-dialogue"Marceu","Giveittohimóhe,hiswife."我做了以下解析。require'csv'input_string=File.read("../csv_parse.rb").force_encoding("ISO-8859-1").encode("UTF-8")#=>"Name,main-dialogue\r\n\"Marceu\",\"Giveittohim\x97he,hiswife.\"\
我有一个对象has_many应呈现为xml的子对象。这不是问题。我的问题是我创建了一个Hash包含此数据,就像解析器需要它一样。但是rails自动将整个文件包含在.........我需要摆脱type="array"和我该如何处理?我没有在文档中找到任何内容。 最佳答案 我遇到了同样的问题;这是我的XML:我在用这个:entries.to_xml将散列数据转换为XML,但这会将条目的数据包装到中所以我修改了:entries.to_xml(root:"Contacts")但这仍然将转换后的XML包装在“联系人”中,将我的XML代码修改为
为了将Cucumber用于命令行脚本,我按照提供的说明安装了arubagem。它在我的Gemfile中,我可以验证是否安装了正确的版本并且我已经包含了require'aruba/cucumber'在'features/env.rb'中为了确保它能正常工作,我写了以下场景:@announceScenario:Testingcucumber/arubaGivenablankslateThentheoutputfrom"ls-la"shouldcontain"drw"假设事情应该失败。它确实失败了,但失败的原因是错误的:@announceScenario:Testingcucumber/ar
我遵循MichaelHartl的“RubyonRails教程:学习Web开发”,并创建了检查用户名和电子邮件长度有效性的测试(名称最多50个字符,电子邮件最多255个字符)。test/helpers/application_helper_test.rb的内容是:require'test_helper'classApplicationHelperTest在运行bundleexecraketest时,所有测试都通过了,但我看到以下消息在最后被标记为错误:ERROR["test_full_title_helper",ApplicationHelperTest,1.820016791]test
我正在尝试从Postgresql表(table1)中获取数据,该表由另一个相关表(property)的字段(table2)过滤。在纯SQL中,我会这样编写查询:SELECT*FROMtable1JOINtable2USING(table2_id)WHEREtable2.propertyLIKE'query%'这工作正常:scope:my_scope,->(query){includes(:table2).where("table2.property":query)}但我真正需要的是使用LIKE运算符进行过滤,而不是严格相等。然而,这是行不通的:scope:my_scope,->(que