Re: a question about ScriptTag
Derrick Oswald <DerrickOswald <at> Rogers.com>
2004-12-13 12:45:04 GMT
There is a known problem with parsing scripts that contain quotes, newlines
and comments:
1024045 StringBean crashes on an URL
This might be the same problem.
Derrick
biao liu wrote:
>To get the pure text, StringBean is a nice choice. But
>these days, I have found some pages for which StringBean
>can't get the right result.
>The raw code of the page is shown below (for
>simplicity, I have simplified the raw code):
><html>
><head>
><title>Finance adviser</title>
><meta http-equiv="Content-Type" content="text/html;
>charset=gb2312">
><script language=javascript>
>function chg(chgobj,chghtml)
>{
>try {
>obj=eval(chgobj);
>if (typeof(obj.length)=="number"){for
>(i=0;i<obj.length;i++)obj[i].innerHTML =chghtml}
>else {obj.innerHTML =chghtml}
>}
>catch(e){return false}
>finally{}
>}
>function changead()
>{
>adstr='<a
>href=http://web1.jrj.com.cn/Myhome/account/ShowMore.asp
>target=_blank
>class=tru><u>太阳每天都是新的,帐户每日都在增值!</u></a>'
>;
>adstr1='用基金经理、机构的视角观察股市......';
>adstr2='全新优良投资工具,透视股市的智能波段王' ;
>//chg("SpanAD1","(<a
>href=http://user.jrj.com.cn/jrjref/default.htm
>class=tru >"+adstr1+"</a>)")
>chg("SpanAD1","(<a
>href=http://www.jrj.com.cn/xfile/index.htm class=tru
>
>
>>"+adstr1+"</a>)")
>>
>>
>chg("SpanAD2","(<a
>href=http://web1.jrj.com.cn/Myhome/mystock/ShowMore.asp
>class=tbu>行情</a>,<a
>href=http://sms.jrj.com.cn/jrjsms/mysms/stock.asp
>class=tru>预警</a>,<a
>href=http://sms.jrj.com.cn/jrjsms/mysms/radar.asp
>class=tbu>轨迹</a>,<a
>href=http://user.jrj.com.cn/Custom/Default.asp
>class=tbu>资讯</a>)")
>chg("SpanAD3","("+adstr+")")
>chg("SpanAD4","(<a
>href=http://www.jrj.com.cn/ToLink.asp
>class=tru>"+adstr2+"</a>)")
>chg("SpanAD5","<table WIDTH=250 align=left
>CELLPADDING=2 CELLSPACING=2
>BGCOLOR=#f4f7fb><tr><td><OBJECT
>classid=clsid:D27CDB6E-AE6D-11cf-96B8-444553540000
>codebase=http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=5,0,0,0
>WIDTH=250 HEIGHT=214 id=ShockwaveFlash1><PARAM
>NAME=movie
>VALUE=http://www.jrj.com.cn/action/dxjzfwhzh.swf><PARAM
>NAME=scale
>VALUE=exactfit></OBJECT></td></tr></table>")
>}
>function ShowStock()
>{
>adstr='为我的股票买份保险!'
>a=new Array()
>if (Stock!="")
>{
>a=Stock.split(",")
>document.write("<tr bgcolor=#edf0f5><td
>class=tb>相关股票:")
>for (i=0;i<a.length;i++)
>{
>document.write("<a
>href=http://share.jrj.com.cn/cominfo/default.asp?or_gpdm="+a[i]+"
>class=tl>"+a[i]+"</a> ")
>}
>document.write(" <a
>href=http://sms.jrj.com.cn/jrjsms/MySMS/Stock.asp
>target=_blank><u>"+adstr+"</u></a></td></tr>")
>}
>}
>function ShowRNews()
>{
>adstr='我的行情预警系统 不再被套的秘密武器'
>if (R.length>0)
>{
>document.write('<tr bgcolor=#ffc800><td><table
>width=100% cellspacing=0 cellpadding=0><tr><td
>class=f3>相关链接:</td><td align=right><A
>href=http://sms.jrj.com.cn/jrjsms/MySMS/Stock.asp
>target=_blank
>class=f3><u>'+adstr+'</u></A></td></tr></table></td></tr>')
>document.write("<tr bgcolor=#edf0f5><td>")
>for (i=0;i<(R.length+1)/3-1;i++)
>{
>newsdate=R[i*3+1]
>tmpstr='000000000000'+R[i*3+2]
>tmpstr=tmpstr.substr(tmpstr.length-12,12)
>document.write('<a
>href=http://news1.jrj.com.cn/news/'+newsdate.substr(0,10)+'/'+tmpstr+'.html
>class=tl>'+R[i*3]+'</a>')
>document.write(" <font
>class=c6>("+R[i*3+1].substr(0,R[i*3+1].length-3)+")</font><br>")
>}
>document.write("</td></tr>")
>document.write("<tr><td align=center
>bgcolor=#edf0f5><a href=http://www.jrj.com.cn/
>class=f3
>
>
>>股票_证券_基金_财经_尽在中国金融界</a></td></tr>")
>>
>>
>}
>}
>function SendSms()
>{
>frm.Msg.value = strNewsTitle.innerText;
>frm.Urltree.value = document.all.StrUrltree.innerText;
>if (frm.Msg.value!='')
>{
>window.open('about:blank','SMS','height=468,width=502');
>frm.submit();
>}
>}
>Stock=''
>R=new Array()
></script>
><base target=_blank>
></head>
><body leftmargin=0 topmargin=5 onload=changead()>
><table><tr><td><font>something should display
>here!</font></td></tr></table>
></body></html>
><script language=javascript>
>function QuoteClick()
>{var NewPath;
>d = new Date();
>s ="Cs" + d.getUTCHours() + d.getUTCMinutes() +
>d.getUTCSeconds() + d.getUTCMilliseconds();
>if (document.symbol_entry.symbol.value=='')
>document.symbol_entry.symbol.value='000001'
>lcSelect =
>document.symbol_entry.menu1.options[document.symbol_entry.menu1.selectedIndex].value;
>if (lcSelect=='10'){
>window.open("http://quote.jrj.com.cn/htmdata/switch.asp?code="+escape(document.symbol_entry.symbol.value),"jrjminhello","height=250,width=270,status=no,toolbar=no,menubar=no,location=no,resizable=no");
>return false;}
>if (lcSelect=='11'){
>window.open("http://quote.jrj.com.cn/htmdata/gif.asp?code="+escape(document.symbol_entry.symbol.value),s,"height=290,width=434,status=no,toolbar=no,menubar=no,location=no,resizable=no","replace");
>return false;
>}
>if (lcSelect=='12'){
>window.open("http://quote.jrj.com.cn/htmdata/kline.asp?code="+escape(document.symbol_entry.symbol.value),s,"height=290,width=434,status=no,toolbar=no,menubar=no,location=no,resizable=no","replace");
>return false;
>}
>if (lcSelect=='13'){
>window.open("http://share.jrj.com.cn/cominfo/gsgg.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='14'){
>window.open("http://share.jrj.com.cn/cominfo/ggxw.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='15'){
>window.open("http://share.jrj.com.cn/cominfo/ggpl.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='16'){
>window.open("http://share.jrj.com.cn/cominfo/gsgk.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='17'){
>window.open("http://share.jrj.com.cn/cominfo/gbjg.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='18'){
>window.open("http://share.jrj.com.cn/cominfo/mgsy.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='19'){
>window.open("http://share.jrj.com.cn/cominfo/fhsp.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='20'){
>window.open("http://share.jrj.com.cn/cominfo/sdgd.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='21'){
>window.open("http://share.jrj.com.cn/cominfo/jbcwsj.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='22'){
>window.open("http://share.jrj.com.cn/cominfo/cwbl.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='23'){
>window.open("http://share.jrj.com.cn/cominfo/zcfzb.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='24'){
>window.open("http://share.jrj.com.cn/cominfo/glgs.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='25'){
>window.open("http://share.jrj.com.cn/cominfo/ggry.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='26'){
>window.open("http://share.jrj.com.cn/cominfo/zqbg.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='261'){
>window.open("http://share.jrj.com.cn/cominfo/jdbg.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='27'){
>window.open("http://share.jrj.com.cn/cominfo/ndbg.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='28'){
>window.open("http://share.jrj.com.cn/cominfo/gszc.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='29'){
>window.open("http://share.jrj.com.cn/cominfo/pgggs.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='30'){
>window.open("http://share.jrj.com.cn/cominfo/ssggs.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>if (lcSelect=='31'){
>window.open("http://share.jrj.com.cn/cominfo/jjcc.asp?or_gpdm="
>+ escape(document.symbol_entry.symbol.value));
>return false;
>}
>}
></script>
>
>The right result should be: Finance adviser something
>should display here!
>But the result of StringBean is just: Finance adviser.
>
>"something should display here!" is lost.
>Then I used NodeVisitor to parse the page, and I found that
>ScriptTag isn't correctly detected. The first <script>
>and the second </script> are seen as a pair, so
>the content between them disappears.
>
>I don't know why htmlparser can't parse this page
>correctly. I think maybe the tags (e.g. <td>) in the first
>pair of script tags confused htmlparser. How can I parse
>this page correctly?
>
>
>
>
-------------------------------------------------------
SF email is sponsored by - The IT Product Guide
Read honest & candid reviews on hundreds of IT Products from real users.
Discover which products truly live up to the hype. Start reading now.
http://productguide.itmanagersjournal.com/