96 lines
3.2 KiB
Lua
96 lines
3.2 KiB
Lua
|
-- Shorthand for the pattern-substitution routine used by every pass below.
local gsub = string.gsub

-- Elements whose whole content (not just the tags) is discarded.
local HTML_DROPPED_ELEMENTS = { "head", "script", "style" }

-- Text emitted in place of tags that carry layout meaning. Keys are the
-- lowercase tag name, prefixed with "/" for closing tags. Every tag that is
-- not listed here is removed without leaving anything behind.
local HTML_TAG_TEXT = {
  td      = "\t",    -- table cell -> tabulator
  br      = "\n",    -- line break -> newline
  li      = " * ",   -- list item  -> asterisk surrounded by spaces
  ["/li"] = "\n",
  div     = "\n\n",
  p       = "\n\n",
  tr      = "\n\n",
  ul      = "\n\n",
  ["/ul"] = "\n\n",
}

-- Plain-text stand-ins for named entities, keyed by the name between "&"
-- and ";". Entities that are not listed here are deleted. Extend this table
-- as needed.
local HTML_ENTITY_TEXT = {
  nbsp   = " ",
  bull   = " * ",
  lsaquo = "<",
  rsaquo = ">",
  trade  = "(tm)",
  frasl  = "/",
  lt     = "<",
  gt     = ">",
  copy   = "(c)",
  reg    = "(r)",
  amp    = "&",
  quot   = '"',
}

-- The same symbols when they arrive as literal UTF-8 characters instead of
-- entities. Keys are written as byte escapes so that they do not depend on
-- the encoding this source file is saved in.
local HTML_CHAR_TEXT = {
  ["\194\160"]     = " ",     -- U+00A0 no-break space
  ["\226\128\162"] = " * ",   -- U+2022 bullet
  ["\226\128\185"] = "<",     -- U+2039 single left angle quote
  ["\226\128\186"] = ">",     -- U+203A single right angle quote
  ["\226\132\162"] = "(tm)",  -- U+2122 trade mark sign
  ["\226\129\132"] = "/",     -- U+2044 fraction slash
  ["\194\169"]     = "(c)",   -- U+00A9 copyright sign
  ["\194\174"]     = "(r)",   -- U+00AE registered sign
}

-- Removes every <name ...> ... </name> element together with its content.
-- Expects tags to be lowercase already. An element that is never closed is
-- left alone; its tags are dropped later by the generic tag pass.
local function html_strip_element (text, name)
  -- %f[%W] demands that the tag name ends here, so "head" cannot match
  -- "<header>". The lazy ".-" stops at the nearest closing tag, so text
  -- between two separate elements survives.
  local pattern = "<%s*" .. name .. "%f[%W][^>]*>.-<%s*/%s*" .. name .. "%s*>"
  return (gsub (text, pattern, ""))
end

-- Maps one complete tag ("<...>") to its plain-text replacement.
local function html_tag_to_text (tag)
  local slash, name = string.match (tag, "^<%s*(/?)%s*(%a%w*)")
  if not name then
    -- Comments, doctype declarations and other non-element markup.
    return ""
  end
  return HTML_TAG_TEXT[slash .. string.lower (name)] or ""
end

-- Maps the name of an entity to its replacement; unknown entities vanish.
local function html_entity_to_text (name)
  return HTML_ENTITY_TEXT[name] or ""
end

--- Converts an HTML document or fragment to plain text.
--
-- The passes run in a fixed order:
--   1. tags are lowercased;
--   2. every run of whitespace (tabs, CR, LF, repeated spaces) collapses to
--      a single space, as a browser would render it;
--   3. head, script and style elements are removed with their content;
--   4. td, br, li, div, p, tr and ul tags become tabs / newlines / bullets
--      and all remaining tags are removed;
--   5. known entities are replaced by ASCII text, unknown ones are removed.
--
-- Anything of the form "<...>" is treated as a tag, so literal "<" and ">"
-- in running text should be written as entities in the input.
--
-- @param text  HTML source (string)
-- @return      the plain-text rendering (string)
function HTML_ToText (text)
  -- Lowercase the tags so the patterns below need only one spelling.
  text = gsub (text, "%b<>", string.lower)

  -- Kill the developer formatting. This must happen before step 4 so the
  -- tabs and newlines produced there are kept.
  text = gsub (text, "%s+", " ")

  for _, name in ipairs (HTML_DROPPED_ELEMENTS) do
    text = html_strip_element (text, name)
  end

  -- Translate the layout tags and nuke all other tags in a single pass.
  text = gsub (text, "%b<>", html_tag_to_text)

  -- Entities are decoded after the tags are gone, so a decoded "&lt;" is
  -- never mistaken for markup. One pass also means "&amp;lt;" correctly
  -- yields "&lt;" rather than "<".
  text = gsub (text, "&(#?%w+);", html_entity_to_text)

  -- Literal symbols: match one 2- or 3-byte UTF-8 sequence starting with the
  -- lead bytes used in HTML_CHAR_TEXT. Sequences missing from the table are
  -- kept unchanged (gsub keeps the match when the lookup yields nil).
  text = gsub (text, "[\194\226][\128-\191]+", HTML_CHAR_TEXT)

  return text
end
|