start rewriting readability
This commit is contained in:
parent
8c44d2fc87
commit
e5920259b6
8 changed files with 238 additions and 69 deletions
|
|
@@ -2,9 +2,6 @@ package scraper
|
|||
|
||||
import (
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
func any(els []string, el string, match func(string, string) bool) bool {
|
||||
|
|
@@ -16,44 +13,6 @@ func any(els []string, el string, match func(string, string) bool) bool {
|
|||
return false
|
||||
}
|
||||
|
||||
func getAttr(node *html.Node, key string) string {
|
||||
for _, a := range node.Attr {
|
||||
if a.Key == key {
|
||||
return a.Val
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func getText(node *html.Node) string {
|
||||
text := make([]string, 0)
|
||||
isTextNode := func(n *html.Node) bool {
|
||||
return n.Type == html.TextNode
|
||||
}
|
||||
for _, n := range getNodes(node, isTextNode) {
|
||||
text = append(text, strings.TrimSpace(n.Data))
|
||||
}
|
||||
return strings.Join(text, " ")
|
||||
}
|
||||
|
||||
func getNodes(node *html.Node, match func(*html.Node) bool) []*html.Node {
|
||||
nodes := make([]*html.Node, 0)
|
||||
|
||||
queue := make([]*html.Node, 0)
|
||||
queue = append(queue, node)
|
||||
for len(queue) > 0 {
|
||||
var n *html.Node
|
||||
n, queue = queue[0], queue[1:]
|
||||
if match(n) {
|
||||
nodes = append(nodes, n)
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
queue = append(queue, c)
|
||||
}
|
||||
}
|
||||
return nodes
|
||||
}
|
||||
|
||||
func absoluteUrl(href, base string) string {
|
||||
baseUrl, err := url.Parse(base)
|
||||
if err != nil {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue