标记语言处理模型(演示版)
添加时间: 2005-6-9 5:05:03 作者: 网络收集 阅读次数:51 来源: http://d9soft.com
这是一个模仿html,xml语言从文本转化为对象这一过程的模型,并降低了一些如tagName不能修改,包围标记只能是“<”、“>”等的限制,尽可能的扩大对文本的自由处理。
通过这个模型,也就可以制作出如 html 和 ubb 的双向转换程序,以及像 Internet Explorer 中显示 XML 文档那样带有语法高亮(highlight)和折叠功能的视图等一系列关于标记语言的实例程序。
标记语言处理模型
<TextArea id="code" rows=15 cols=100>
[b color=#FF0000]aaa
<B>[i]bbb[/i]</B>
<B>ccc</B>
[u]eeee
[/b]
<B>ddd</B>
</TextArea><br />
<h3>属性</h3>
<button onclick="alert(firstchildren.tagName)">tagName</button>
<button onclick="alert(firstchildren.childNodes)">childNodes</button>
<button onclick="alert(firstchildren.attributes)">attributes</button>
<hr />
<h3>方法</h3>
<button onclick="alert(firstchildren.getOuterHTML())">getOuterHTML</button>
<button onclick="alert(firstchildren.getInnerHTML())">getInnerHTML</button>
<button onclick="alert(firstchildren.getAttributeNode('color').value)">getAttributeNode</button>
<button onclick="alert(firstchildren.getChildren())">getChildren</button>
<button onclick="alert(firstchildren.all())">all</button>
<hr />
<h3>实例</h3>
<script>
/**
 * Demo: parses the text of the "code" textarea as UBB (square-bracket tags),
 * switches every element whose tag name is "b" to angle-bracket delimiters,
 * and shows the re-serialised text in an alert box.
 */
function ubb2html()
{
	var ubbDocument = MarkupLanguageDocument(code.value, "[", "]");
	var elements = ubbDocument.all();
	var index = 0;
	var element;
	while (index < elements.length)
	{
		element = elements[index++];
		// Only [b] elements are converted; everything else keeps its brackets.
		if (element.tagName != "b") continue;
		element.lt = "<";
		element.gt = ">";
	}
	alert(ubbDocument.getOuterHTML());
}
</script>
<button onclick="ubb2html()">ubb2html</button>
<xmp style="background-color: #EEEEEE; padding: 10px;">
function ubb2html()
{
var ubbDocument = MarkupLanguageDocument(code.value, "[", "]")
var allObject = ubbDocument.all()
for (var i=0; i<allObject.length; i++)
{
if (allObject[i].tagName == "b")
{
allObject[i].lt = "<"
allObject[i].gt = ">"
}
}
alert(ubbDocument.getOuterHTML())
}
</xmp>
<script>
/*
 * Helpers for building a regular expression out of smaller fragments.
 *
 * A back-reference such as \1 is numbered from the start of the pattern, so
 * it breaks as soon as the fragment is pasted behind other capturing groups.
 * RegExp.prototype.getRelaStr() rewrites every back-reference into a marker
 * "\______rela:K______", where K is the reference number minus the number of
 * capturing groups opened before it. String.prototype.getAbsReg() turns such
 * markers back into absolute references after the fragments are concatenated.
 */
(function ()
{
	/**
	 * Reports whether the "(" that ends just before `index` opens a capturing
	 * group. "(?:", "(?=", "(?!", "(?<=" and "(?<!" do not capture;
	 * "(?<name>" does.
	 */
	function opensCapturingGroup(source, index)
	{
		if (source.charAt(index) !== "?") return true;
		if (source.charAt(index + 1) !== "<") return false;
		var third = source.charAt(index + 2);
		return third !== "=" && third !== "!";
	}

	/**
	 * Copies `source` token by token. Every escape sequence is handed to
	 * `rewriteEscape(escape, groupCount, inClass)` and replaced by its return
	 * value; `groupCount` is the number of capturing groups opened so far and
	 * `inClass` tells whether the escape sits inside a character class.
	 */
	function renumberReferences(source, rewriteEscape)
	{
		// Marker first, then numeric escapes, then any other escape, then one char.
		var tokenRe = /\\______rela:-?\d+______|\\\d+|\\[\s\S]|[\s\S]/g;
		var result = "";
		var groupCount = 0;
		var inClass = false;
		var token;
		while ((token = tokenRe.exec(source)) !== null)
		{
			var text = token[0];
			if (text.length > 1)
			{
				// Only escape sequences are longer than one character.
				text = rewriteEscape(text, groupCount, inClass);
			}
			else if (inClass)
			{
				inClass = text !== "]";
			}
			else if (text === "[")
			{
				inClass = true;
			}
			else if (text === "(" && opensCapturingGroup(source, tokenRe.lastIndex))
			{
				groupCount++;
			}
			result += text;
		}
		return result;
	}

	/**
	 * Compiles this string into a RegExp, converting every relative marker
	 * "\______rela:K______" into the absolute back-reference "\N", where
	 * N = K + the number of capturing groups opened before the marker.
	 *
	 * @param {string} [s] Flags passed through to the RegExp constructor.
	 * @returns {RegExp} The compiled expression.
	 */
	String.prototype.getAbsReg = function(s)
	{
		var absolute = renumberReferences(String(this), function(escape, groupCount)
		{
			var marker = /^\\______rela:(-?\d+)______$/.exec(escape);
			return marker ? "\\" + (parseInt(marker[1], 10) + groupCount) : escape;
		});
		return new RegExp(absolute, s);
	};

	/**
	 * Returns this expression's source with every back-reference "\N" replaced
	 * by the relative marker "\______rela:K______", where K = N - the number
	 * of capturing groups opened before the reference. Numeric escapes inside
	 * a character class are left alone because they are not back-references.
	 *
	 * @returns {string} Relocatable pattern text for use with getAbsReg().
	 */
	RegExp.prototype.getRelaStr = function()
	{
		return renumberReferences(this.source, function(escape, groupCount, inClass)
		{
			// "\0" is the NUL escape, so a reference must start with 1-9.
			if (inClass || !/^\\[1-9]\d*$/.test(escape)) return escape;
			return "\\______rela:" + (parseInt(escape.substr(1), 10) - groupCount) + "______";
		});
	};
}());
</script>
<script>
/**
 * Parses marked-up text (HTML, XML, UBB, ...) into a tree of element objects.
 *
 * Requires String.prototype.getAbsReg and RegExp.prototype.getRelaStr, which
 * are used to splice regular-expression fragments together while keeping
 * their back-references pointing at the right groups.
 *
 * @param {string} html Text to parse.
 * @param {string} [lt] Tag-opening delimiter; only its first character is used. Defaults to "<".
 * @param {string} [gt] Tag-closing delimiter; only its first character is used. Defaults to ">".
 * @returns {Object} A root htmlElement without a tagName. Its childNodes hold
 *     the parsed top-level nodes: htmlElement objects for tags, strings for text.
 */
function MarkupLanguageDocument(html, lt, gt)
{
	/**
	 * One element of the tree. tagName, childNodes, attributes, lt and gt are
	 * plain writable properties; lt/gt are the delimiters used when the
	 * element is serialised and are assigned by parseHTML.
	 */
	function htmlElement(tagName)
	{
		this.tagName = tagName;
		this.childNodes = [];
		this.attributes = [];

		/** Replaces the children with the result of parsing `html`; returns the new childNodes. */
		this.setInnerHTML = function(html, lt, gt)
		{
			this.childNodes = parseHTML(html, lt, gt);
			return this.childNodes;
		};

		/** Serialises all children: elements via getOuterHTML(), text as-is. */
		this.getInnerHTML = function()
		{
			var childrenHTML = "";
			for (var i = 0; i < this.childNodes.length; i++)
			{
				var child = this.childNodes[i];
				// Text children are strings, so their constructor differs from ours.
				childrenHTML += (this.constructor == child.constructor) ? child.getOuterHTML() : child;
			}
			return childrenHTML;
		};

		/**
		 * Serialises the element including its own tag. An element without
		 * children is written in self-closing form; an element without a
		 * tagName (the document root) yields only its children.
		 */
		this.getOuterHTML = function()
		{
			var tagName = this.tagName;
			var childrenHTML = this.getInnerHTML();
			var attrHTML = "";
			for (var i = 0; i < this.attributes.length; i++)
			{
				var attribute = this.attributes[i];
				attrHTML += (attribute.name + (attribute.value ? "=\"" + attribute.value + "\" " : " "));
			}
			return tagName ? this.lt + tagName + (attrHTML ? " " + attrHTML : "") + (childrenHTML ? this.gt + childrenHTML + this.lt + "/" + tagName + this.gt : "/" + this.gt) : childrenHTML;
		};

		/** Returns the first attribute object named `name`, or null. */
		this.getAttributeNode = function(name)
		{
			for (var i = 0; i < this.attributes.length; i++)
			{
				if (this.attributes[i].name == name) return this.attributes[i];
			}
			return null;
		};

		/** Returns the direct children that are elements (text is skipped). */
		this.getChildren = function()
		{
			var childrenArray = [];
			for (var i = 0; i < this.childNodes.length; i++)
			{
				if (this.constructor == this.childNodes[i].constructor) childrenArray[childrenArray.length] = this.childNodes[i];
			}
			return childrenArray;
		};

		/** Returns every descendant element in document order. */
		this.all = function()
		{
			var allArray = [];
			var children = this.getChildren();
			for (var i = 0; i < children.length; i++)
			{
				allArray[allArray.length] = children[i];
				allArray = allArray.concat(children[i].all());
			}
			return allArray;
		};
	}

	/** Name/value pair stored in htmlElement.attributes. */
	function attributeObj(name, value)
	{
		this.name = name;
		this.value = value;
	}

	/** Escapes a delimiter so it is literal both inside and outside a character class. */
	function escapeDelimiter(ch)
	{
		return ch.replace(/[\\^$.*+?()[\]{}|\/-]/, "\\$&");
	}

	/**
	 * Splits `html` into top-level nodes and returns them as an array of
	 * htmlElement objects and strings. An element that is never closed
	 * swallows the remaining text as its content.
	 */
	function parseHTML(html, lt, gt)
	{
		lt = lt ? lt.charAt(0) : "<";
		gt = gt ? gt.charAt(0) : ">";
		var lt1 = escapeDelimiter(lt);
		var gt1 = escapeDelimiter(gt);
		// Quoted string, e.g. "abc". [\s\S] is used for "any character" so that no
		// numeric escape sits inside a character class where the fragment helpers
		// could mistake it for a back-reference.
		var strRe = /(["'])(\\["'tbnr]|[\s\S])*?\1/;
		// Attribute, e.g. attribute="abc" in <span attribute="abc">; the value may be
		// quoted, bare, or absent. Group 1 is the name, group 3 the raw value.
		var attrRe = ("(\\w+)(\\s*=\\s*(" + strRe.getRelaStr() + "|[^\\s" + gt1 + "]+)|.{0})").getAbsReg();
		// Tag name, e.g. span in <span attribute="abc">.
		var tagRe = /((\w+:)?\w+)/;
		// Self-closing marker, e.g. the / in <span />.
		var sTagRe = new RegExp("\\/\\s*(?=" + gt1 + ")");
		// Closing marker, e.g. the / in </span>.
		var cTagRe = /\/\s*/;
		// Whole tag, e.g. <span attribute="abc" />.
		// NOTE(review): \w+ inside the repeated attribute group can backtrack heavily
		// on a long run of word characters that follows an opening delimiter and is
		// never closed; confirm acceptable input sizes before using on untrusted text.
		var objRe = (lt1 + "(" + cTagRe.getRelaStr() + ")?" + tagRe.getRelaStr() + "(" + attrRe.getRelaStr() + "|\\s)*?(" + sTagRe.getRelaStr() + ")?" + gt1).getAbsReg();
		// Node: either one tag or a run of text, e.g. "this is a " and
		// <img src="simple.jpg" /> in: this is a <img src="simple.jpg" />
		var nodeRe = ("(" + objRe.getRelaStr() + "|[^" + lt1 + "]*)").getAbsReg();
		// Text up to (not including) the next opening delimiter.
		var textRe = new RegExp("^[^" + lt1 + "]*");
		// Start of a closing tag, e.g. "</".
		var closeRe = ("^" + lt1 + cTagRe.getRelaStr()).getAbsReg();
		var attrGlobalRe = new RegExp(attrRe.source, "g");
		var htmlArray = [];

		/**
		 * Cuts the first node off `html` and describes it: nodeValue (its text),
		 * otherHTML (the remainder), tagName (first name-like token, if any) and
		 * attrs (raw attribute strings found after the tag name).
		 */
		function getFirstNode(html)
		{
			var matched = nodeRe.exec(html);
			var nodeValue = matched ? matched[0] : "";
			if (!nodeValue)
			{
				// An opening delimiter that does not start a valid tag: treat it and
				// the text behind it as plain text, otherwise the caller never advances.
				nodeValue = html.charAt(0) + textRe.exec(html.substr(1))[0];
			}
			var tagMatch = tagRe.exec(nodeValue);
			var tmpAttrStr = nodeValue.replace(tagRe, "");
			return {
				nodeValue: nodeValue,
				otherHTML: html.substr(nodeValue.length),
				tagName: tagMatch ? tagMatch[0] : undefined,
				attrs: tmpAttrStr.match(attrGlobalRe) || []
			};
		}

		while (html)
		{
			var tmpNode = getFirstNode(html);
			var node = tmpNode.nodeValue;
			html = tmpNode.otherHTML;
			if (objRe.test(node))
			{
				var tagName = tmpNode.tagName;
				var childrenhtml = "";
				// Nesting depth of still-open tags with this name; 0 for a self-closing tag.
				var depth = sTagRe.test(node) ? 0 : 1;
				var attrsObj = [];
				node = new htmlElement(tagName);
				for (var i = 0; i < tmpNode.attrs.length; i++)
				{
					var tmpAr = tmpNode.attrs[i].match(attrRe);
					// Group 3 did not participate for a value-less attribute; strip the
					// surrounding quotes from a quoted value.
					attrsObj[i] = new attributeObj(tmpAr[1], (tmpAr[3] || "").replace(/^(["'])([.\s\S]*?)\1$/, "$2"));
				}
				node.attributes = attrsObj;
				node.lt = lt;
				node.gt = gt;
				while (depth && html)
				{
					var inner = getFirstNode(html);
					html = inner.otherHTML;
					// Only real tags change the depth; plain text that happens to start
					// with the tag name must not be counted as an opening tag.
					if (inner.tagName == tagName && objRe.test(inner.nodeValue) && !sTagRe.test(inner.nodeValue))
					{
						if (closeRe.test(inner.nodeValue))
						{
							depth--;
						}
						else
						{
							depth++;
						}
					}
					if (depth) childrenhtml += inner.nodeValue;
				}
				node.childNodes = parseHTML(childrenhtml, lt, gt);
			}
			htmlArray[htmlArray.length] = node;
		}
		return htmlArray;
	}

	var mlDocument = new htmlElement();
	mlDocument.childNodes = parseHTML(html, lt, gt);
	return mlDocument;
}
// Globals used by the demo buttons' inline onclick handlers: the parsed
// textarea content (UBB-style brackets) and its second top-level node.
var parseDocument = MarkupLanguageDocument(code.value, "[", "]"),
	firstchildren = parseDocument.childNodes[1];
</script>
这个程序的处理方法大致是:通过正则表达式的拼接(getAbsReg 和 getRelaStr 这两个自定义函数),按 字符串 → 属性 → 对象 → 节点 的顺序一级级地拼接出它们的正则表达式,然后再从节点开始,反向地从文本流中提取对应的内容,并将其设置到 htmlElement 类的相应属性上,也就是点击上面的按钮时看到的 [object Object]。
最后,实例程序通过设置和使用 htmlElement 提供的属性和方法,将文本的每一个细节修改到最终所需要的状态,再使用 getOuterHTML 或 getInnerHTML 方法得到处理之后的文本流。
另外,用这种方法处理还可以达到标记自动补全的功能。
上一篇文章: IE色彩处理过程 下一篇文章: 显示行号的文本输入框

