154 lines
3.1 KiB
Go
154 lines
3.1 KiB
Go
// Package parser extracts title, description, text content, and links from HTML.
|
|
package parser
|
|
|
|
import (
|
|
"path"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
// wsRe matches any run of one or more whitespace characters. ParseHTML uses
// it to collapse such runs inside text nodes into a single space.
var wsRe = regexp.MustCompile(`\s+`)
|
|
|
|
// ParseHTML parses an HTML document and returns title, meta description, body text, and href list.
|
|
func ParseHTML(body, baseURL string) (title, description, text string, hrefs []string) {
|
|
// Determine base scheme+host
|
|
base := baseFromURL(baseURL)
|
|
basePath := pathFromURL(baseURL)
|
|
|
|
doc, err := html.Parse(strings.NewReader(body))
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
var textParts []string
|
|
|
|
var dfs func(n *html.Node)
|
|
dfs = func(n *html.Node) {
|
|
if n.Type == html.ElementNode {
|
|
tag := strings.ToLower(n.Data)
|
|
if tag == "script" || tag == "style" || tag == "svg" {
|
|
return
|
|
}
|
|
if tag == "meta" {
|
|
name := ""
|
|
content := ""
|
|
for _, a := range n.Attr {
|
|
switch strings.ToLower(a.Key) {
|
|
case "name":
|
|
name = strings.ToLower(a.Val)
|
|
case "content":
|
|
content = a.Val
|
|
}
|
|
}
|
|
if name == "description" && description == "" {
|
|
description = content
|
|
}
|
|
}
|
|
if tag == "a" {
|
|
href := attrVal(n, "href")
|
|
if href != "" {
|
|
href = strings.SplitN(href, "#", 2)[0]
|
|
if href != "" {
|
|
href = resolveURL(base, basePath, href)
|
|
if href != "" {
|
|
hrefs = append(hrefs, href)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if n.Type == html.TextNode && n.Parent != nil {
|
|
parentTag := ""
|
|
if n.Parent.Type == html.ElementNode {
|
|
parentTag = strings.ToLower(n.Parent.Data)
|
|
}
|
|
if parentTag == "script" || parentTag == "style" || parentTag == "svg" {
|
|
goto children
|
|
}
|
|
s := wsRe.ReplaceAllString(n.Data, " ")
|
|
s = strings.TrimSpace(s)
|
|
if s != "" {
|
|
if parentTag == "title" {
|
|
title = s
|
|
} else {
|
|
textParts = append(textParts, s)
|
|
}
|
|
}
|
|
}
|
|
|
|
children:
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
dfs(c)
|
|
}
|
|
}
|
|
dfs(doc)
|
|
|
|
text = strings.Join(textParts, " ")
|
|
return
|
|
}
|
|
|
|
func attrVal(n *html.Node, key string) string {
|
|
for _, a := range n.Attr {
|
|
if strings.ToLower(a.Key) == key {
|
|
return a.Val
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// baseFromURL returns the scheme and authority part of rawURL, for example
// "https://example.com" for "https://example.com/a/b?x=1". The result has no
// trailing slash.
//
// It returns "" when rawURL contains no "://" separator.
func baseFromURL(rawURL string) string {
	idx := strings.Index(rawURL, "://")
	if idx < 0 {
		return ""
	}
	rest := rawURL[idx+3:]
	// The authority ends at the first "/", "?" or "#". Looking only for "/"
	// would leave a query or fragment attached to the host for URLs such as
	// "http://example.com?x=1".
	end := strings.IndexAny(rest, "/?#")
	if end < 0 {
		return rawURL
	}
	return rawURL[:idx+3+end]
}
|
|
|
|
// pathFromURL returns the path component of rawURL without query string or
// fragment, for example "/a/b" for "https://example.com/a/b?x=1#top".
//
// It returns "/" when rawURL contains no "://" separator or has no path.
func pathFromURL(rawURL string) string {
	idx := strings.Index(rawURL, "://")
	if idx < 0 {
		return "/"
	}
	rest := rawURL[idx+3:]
	// Drop query and fragment before locating the path, so that a "/" inside
	// them (e.g. "http://example.com?next=/foo") is not mistaken for the
	// start of the path.
	if end := strings.IndexAny(rest, "?#"); end >= 0 {
		rest = rest[:end]
	}
	slash := strings.IndexByte(rest, '/')
	if slash < 0 {
		return "/"
	}
	return rest[slash:]
}
|
|
|
|
// resolveURL converts href, taken from a link on the page identified by base
// (scheme and host, e.g. "https://example.com") and basePath (that page's
// path, e.g. "/docs/index.html"), into an absolute URL.
//
// Handling by href form:
//   - "http://..." / "https://..." (scheme matched case-insensitively):
//     returned unchanged.
//   - any other scheme ("mailto:", "javascript:", "tel:", ...): "" is
//     returned, because such links are not HTTP documents.
//   - "//host/path": prefixed with the scheme of base; "" if base has none.
//   - "/path": appended to base.
//   - anything else: resolved relative to the directory of basePath. "." and
//     ".." segments are collapsed, a trailing slash is preserved, and the
//     query string is carried over untouched. A query-only href ("?page=2")
//     keeps basePath itself.
//
// href is expected to have had its fragment removed by the caller.
func resolveURL(base, basePath, href string) string {
	// An href that starts with a URI scheme is already absolute. Only http(s)
	// targets are kept; treating e.g. "mailto:x@y" as a relative path would
	// fabricate a URL that does not exist.
	if scheme := urlScheme(href); scheme != "" {
		isHTTP := strings.EqualFold(scheme, "http") || strings.EqualFold(scheme, "https")
		if isHTTP && strings.HasPrefix(href[len(scheme)+1:], "//") {
			return href
		}
		return ""
	}

	// Protocol-relative: reuse the scheme of the base URL.
	if strings.HasPrefix(href, "//") {
		idx := strings.Index(base, "://")
		if idx < 0 {
			return ""
		}
		return base[:idx+1] + href
	}

	// Absolute path on the same host.
	if strings.HasPrefix(href, "/") {
		return base + href
	}

	// Relative reference. Split off the query first so that path.Clean only
	// ever sees the path component ("a?next=../x" must not lose its query).
	ref, query := href, ""
	if i := strings.IndexByte(href, '?'); i >= 0 {
		ref, query = href[:i], href[i:]
	}
	if ref == "" {
		// Query-only (or empty) reference: same document, different query.
		return base + basePath + query
	}

	// The leading "/" keeps the result rooted even if basePath is empty, and
	// makes path.Clean drop ".." segments that would climb above the root.
	resolved := path.Clean("/" + path.Dir(basePath) + "/" + ref)

	// path.Clean removes a trailing slash, but "dir" and "dir/" are distinct
	// URLs. Restore it when the reference named a directory.
	last := ref[strings.LastIndexByte(ref, '/')+1:]
	if resolved != "/" && (last == "" || last == "." || last == "..") {
		resolved += "/"
	}
	return base + resolved + query
}

// urlScheme returns the URI scheme s starts with, without the trailing colon,
// or "" if s does not begin with one. A scheme is a letter followed by any
// number of letters, digits, "+", "-" or ".", terminated by ":".
func urlScheme(s string) string {
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
			// Letters are valid at every position.
		case '0' <= c && c <= '9' || c == '+' || c == '-' || c == '.':
			if i == 0 {
				return ""
			}
		case c == ':':
			return s[:i]
		default:
			// "/", "?", "#" or any other character before a colon means the
			// colon, if any, belongs to a later component.
			return ""
		}
	}
	return ""
}
|