// Package parser extracts title, description, text content, and links from HTML. package parser import ( "path" "regexp" "strings" "golang.org/x/net/html" ) var wsRe = regexp.MustCompile(`\s+`) // ParseHTML parses an HTML document and returns title, meta description, body text, and href list. func ParseHTML(body, baseURL string) (title, description, text string, hrefs []string) { // Determine base scheme+host base := baseFromURL(baseURL) basePath := pathFromURL(baseURL) doc, err := html.Parse(strings.NewReader(body)) if err != nil { return } var textParts []string var dfs func(n *html.Node) dfs = func(n *html.Node) { if n.Type == html.ElementNode { tag := strings.ToLower(n.Data) if tag == "script" || tag == "style" || tag == "svg" { return } if tag == "meta" { name := "" content := "" for _, a := range n.Attr { switch strings.ToLower(a.Key) { case "name": name = strings.ToLower(a.Val) case "content": content = a.Val } } if name == "description" && description == "" { description = content } } if tag == "a" { href := attrVal(n, "href") if href != "" { href = strings.SplitN(href, "#", 2)[0] if href != "" { href = resolveURL(base, basePath, href) if href != "" { hrefs = append(hrefs, href) } } } } } if n.Type == html.TextNode && n.Parent != nil { parentTag := "" if n.Parent.Type == html.ElementNode { parentTag = strings.ToLower(n.Parent.Data) } if parentTag == "script" || parentTag == "style" || parentTag == "svg" { goto children } s := wsRe.ReplaceAllString(n.Data, " ") s = strings.TrimSpace(s) if s != "" { if parentTag == "title" { title = s } else { textParts = append(textParts, s) } } } children: for c := n.FirstChild; c != nil; c = c.NextSibling { dfs(c) } } dfs(doc) text = strings.Join(textParts, " ") return } func attrVal(n *html.Node, key string) string { for _, a := range n.Attr { if strings.ToLower(a.Key) == key { return a.Val } } return "" } func baseFromURL(rawURL string) string { idx := strings.Index(rawURL, "://") if idx < 0 { return "" } rest := rawURL[idx+3:] slash := strings.Index(rest, "/") if slash < 0 { return rawURL } return rawURL[:idx+3+slash] } func pathFromURL(rawURL string) string { idx := strings.Index(rawURL, "://") if idx < 0 { return "/" } rest := rawURL[idx+3:] slash := strings.Index(rest, "/") if slash < 0 { return "/" } p := rest[slash:] // strip query/fragment p = strings.SplitN(p, "?", 2)[0] p = strings.SplitN(p, "#", 2)[0] return p } func resolveURL(base, basePath, href string) string { // Absolute URL if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") { return href } // Protocol-relative if strings.HasPrefix(href, "//") { // extract scheme from base idx := strings.Index(base, "://") if idx < 0 { return "" } return base[:idx+1] + href } // Absolute path if strings.HasPrefix(href, "/") { return base + href } // Relative path dir := path.Dir(basePath) return base + path.Clean(dir+"/"+href) }