Signed-off-by: 吴文峰 <kevin@lmve.net>
This commit is contained in:
@@ -0,0 +1,153 @@
|
||||
// Package parser extracts title, description, text content, and links from HTML.
|
||||
package parser
|
||||
|
||||
import (
|
||||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// wsRe matches runs of whitespace; ParseHTML uses it to collapse each run in a
// text node into a single space.
var wsRe = regexp.MustCompile(`\s+`)
|
||||
|
||||
// ParseHTML parses an HTML document and returns title, meta description, body text, and href list.
|
||||
func ParseHTML(body, baseURL string) (title, description, text string, hrefs []string) {
|
||||
// Determine base scheme+host
|
||||
base := baseFromURL(baseURL)
|
||||
basePath := pathFromURL(baseURL)
|
||||
|
||||
doc, err := html.Parse(strings.NewReader(body))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
var textParts []string
|
||||
|
||||
var dfs func(n *html.Node)
|
||||
dfs = func(n *html.Node) {
|
||||
if n.Type == html.ElementNode {
|
||||
tag := strings.ToLower(n.Data)
|
||||
if tag == "script" || tag == "style" || tag == "svg" {
|
||||
return
|
||||
}
|
||||
if tag == "meta" {
|
||||
name := ""
|
||||
content := ""
|
||||
for _, a := range n.Attr {
|
||||
switch strings.ToLower(a.Key) {
|
||||
case "name":
|
||||
name = strings.ToLower(a.Val)
|
||||
case "content":
|
||||
content = a.Val
|
||||
}
|
||||
}
|
||||
if name == "description" && description == "" {
|
||||
description = content
|
||||
}
|
||||
}
|
||||
if tag == "a" {
|
||||
href := attrVal(n, "href")
|
||||
if href != "" {
|
||||
href = strings.SplitN(href, "#", 2)[0]
|
||||
if href != "" {
|
||||
href = resolveURL(base, basePath, href)
|
||||
if href != "" {
|
||||
hrefs = append(hrefs, href)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if n.Type == html.TextNode && n.Parent != nil {
|
||||
parentTag := ""
|
||||
if n.Parent.Type == html.ElementNode {
|
||||
parentTag = strings.ToLower(n.Parent.Data)
|
||||
}
|
||||
if parentTag == "script" || parentTag == "style" || parentTag == "svg" {
|
||||
goto children
|
||||
}
|
||||
s := wsRe.ReplaceAllString(n.Data, " ")
|
||||
s = strings.TrimSpace(s)
|
||||
if s != "" {
|
||||
if parentTag == "title" {
|
||||
title = s
|
||||
} else {
|
||||
textParts = append(textParts, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
children:
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
dfs(c)
|
||||
}
|
||||
}
|
||||
dfs(doc)
|
||||
|
||||
text = strings.Join(textParts, " ")
|
||||
return
|
||||
}
|
||||
|
||||
func attrVal(n *html.Node, key string) string {
|
||||
for _, a := range n.Attr {
|
||||
if strings.ToLower(a.Key) == key {
|
||||
return a.Val
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// baseFromURL returns the "scheme://authority" prefix of rawURL, without any
// path, query, or fragment. It returns "" when rawURL contains no "://".
func baseFromURL(rawURL string) string {
	idx := strings.Index(rawURL, "://")
	if idx < 0 {
		return ""
	}
	rest := rawURL[idx+3:]
	// The authority ends at the first '/', '?', or '#'. Looking only for '/'
	// would let a query or fragment on a path-less URL leak into the base.
	end := strings.IndexAny(rest, "/?#")
	if end < 0 {
		return rawURL
	}
	return rawURL[:idx+3+end]
}
|
||||
|
||||
// pathFromURL returns the path component of rawURL with any query and fragment
// removed. It returns "/" when rawURL contains no "://" or has no path.
func pathFromURL(rawURL string) string {
	idx := strings.Index(rawURL, "://")
	if idx < 0 {
		return "/"
	}
	rest := rawURL[idx+3:]
	// The authority ends at the first '/', '?', or '#'. Only a '/' starts a
	// path; a '/' that appears later inside a query or fragment does not.
	start := strings.IndexAny(rest, "/?#")
	if start < 0 || rest[start] != '/' {
		return "/"
	}
	p := rest[start:]
	// strip query/fragment
	if end := strings.IndexAny(p, "?#"); end >= 0 {
		p = p[:end]
	}
	return p
}
|
||||
|
||||
// resolveURL turns href, as found in a document, into an absolute URL.
//
// base is the "scheme://host" prefix of the document URL and basePath is the
// document's path; relative references are resolved against them. Leading and
// trailing whitespace in href is ignored.
//
// It returns "" when href cannot be turned into an http(s) URL: an empty
// href, a reference with any other scheme (mailto:, javascript:, tel:, ...),
// or a protocol-relative reference when base has no scheme.
func resolveURL(base, basePath, href string) string {
	href = strings.TrimSpace(href)
	if href == "" {
		return ""
	}

	// Protocol-relative: reuse the scheme of the base.
	if strings.HasPrefix(href, "//") {
		idx := strings.Index(base, "://")
		if idx < 0 {
			return ""
		}
		return base[:idx+1] + href
	}

	// Absolute path
	if strings.HasPrefix(href, "/") {
		return base + href
	}

	// A ':' before any '/', '?' or '#' means href carries its own scheme.
	// Only http(s) URLs are passed through; other schemes are not resolvable
	// against the base and must not be mistaken for relative paths.
	if i := strings.IndexAny(href, ":/?#"); i >= 0 && href[i] == ':' {
		scheme := href[:i]
		isHTTP := strings.EqualFold(scheme, "http") || strings.EqualFold(scheme, "https")
		if isHTTP && strings.HasPrefix(href[i:], "://") {
			return href
		}
		return ""
	}

	// Relative reference. Split off the query/fragment first so that path
	// cleaning cannot rewrite it (e.g. collapse "//" inside a query value).
	rel, suffix := href, ""
	if q := strings.IndexAny(href, "?#"); q >= 0 {
		rel, suffix = href[:q], href[q:]
	}
	if rel == "" {
		// A query-only reference keeps the document's own path.
		return base + basePath + suffix
	}

	// The leading "/" keeps the result rooted even if basePath is empty.
	resolved := path.Clean("/" + path.Dir(basePath) + "/" + rel)

	// path.Clean drops a trailing slash; restore it when the reference names
	// a directory ("dir/", ".", "..").
	last := rel[strings.LastIndex(rel, "/")+1:]
	if (last == "" || last == "." || last == "..") && !strings.HasSuffix(resolved, "/") {
		resolved += "/"
	}
	return base + resolved + suffix
}
|
||||
Reference in New Issue
Block a user