yarr/src/worker/crawler.go

package worker

import (
	"bytes"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"time"

	"github.com/nkanaev/yarr/src/content/scraper"
	"github.com/nkanaev/yarr/src/parser"
	"github.com/nkanaev/yarr/src/storage"

	"golang.org/x/net/html/charset"
)
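
// FeedSource is a feed link discovered in an HTML page, presented as a
// candidate when discovery finds more than one feed.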
type FeedSource struct {
	Title string `json:"title"`
	Url   string `json:"url"`
}
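
// DiscoverResult is the outcome of feed discovery: either Feed/FeedLink are
// set (the URL pointed directly at a feed), or Sources lists the feed links
// found in the page.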
type DiscoverResult struct {
	Feed     *parser.Feed
	FeedLink string
	Sources  []FeedSource
}
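
// DiscoverFeed fetches candidateUrl with the package-level HTTP client and
// tries to parse the response as a feed. If the response turns out to be an
// HTML page, it scrapes the page for feed links instead: a single match is
// followed recursively, multiple matches are returned as Sources.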
func DiscoverFeed(candidateUrl string) (*DiscoverResult, error) {
	result := &DiscoverResult{}

	// Query URL
	res, err := client.get(candidateUrl)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()
	if res.StatusCode != 200 {
		return nil, fmt.Errorf("status code %d", res.StatusCode)
	}
	// Decode the body to UTF-8 based on the Content-Type header.
	body, err := charset.NewReader(res.Body, res.Header.Get("Content-Type"))
	if err != nil {
		return nil, err
	}
	content, err := ioutil.ReadAll(body)
	if err != nil {
		return nil, err
	}

	// Try to feed into parser
	feed, err := parser.Parse(bytes.NewReader(content))
	if err == nil {
		feed.TranslateURLs(candidateUrl)
		feed.SetMissingDatesTo(time.Now())
		result.Feed = feed
		result.FeedLink = candidateUrl
		return result, nil
	}

	// Possibly an html link. Search for feed links
	sources := make([]FeedSource, 0)
	for url, title := range scraper.FindFeeds(string(content), candidateUrl) {
		sources = append(sources, FeedSource{Title: title, Url: url})
	}
	switch {
	case len(sources) == 0:
		return nil, errors.New("No feeds found at the given url")
	case len(sources) == 1:
		// Guard against a page that links to itself as its own feed.
		if sources[0].Url == candidateUrl {
			return nil, errors.New("Recursion!")
		}
		return DiscoverFeed(sources[0].Url)
	}
	result.Sources = sources
	return result, nil
}
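
// findFavicon tries to locate a site icon: it collects icon candidates
// scraped from websiteUrl, falls back to /favicon.ico at the website and
// feed hosts, and returns the first body that sniffs as a known image type.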
func findFavicon(websiteUrl, feedUrl string) (*[]byte, error) {
	candidateUrls := make([]string, 0)

	// favicon builds the conventional /favicon.ico URL for a link's host.
	favicon := func(link string) string {
		u, err := url.Parse(link)
		if err != nil {
			return ""
		}
		return fmt.Sprintf("%s://%s/favicon.ico", u.Scheme, u.Host)
	}

	if len(websiteUrl) != 0 {
		res, err := client.get(websiteUrl)
		if err != nil {
			return nil, err
		}
		body, err := ioutil.ReadAll(res.Body)
		res.Body.Close()
		if err != nil {
			return nil, err
		}
		candidateUrls = append(candidateUrls, scraper.FindIcons(string(body), websiteUrl)...)
		if c := favicon(websiteUrl); len(c) != 0 {
			candidateUrls = append(candidateUrls, c)
		}
	}
	if c := favicon(feedUrl); len(c) != 0 {
		candidateUrls = append(candidateUrls, c)
	}

	imageTypes := [4]string{
		"image/x-icon",
		"image/png",
		"image/jpeg",
		"image/gif",
	}
	for _, candidateUrl := range candidateUrls {
		res, err := client.get(candidateUrl)
		if err != nil {
			continue
		}
		// Read and close eagerly: a deferred Close inside the loop would
		// keep every response body open until the function returns.
		content, err := ioutil.ReadAll(res.Body)
		res.Body.Close()
		if res.StatusCode != 200 || err != nil {
			continue
		}
		ctype := http.DetectContentType(content)
		for _, itype := range imageTypes {
			if ctype == itype {
				return &content, nil
			}
		}
	}
	return nil, nil
}
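
// ConvertItems maps parsed feed items onto storage items belonging to feed,
// marking each one unread.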
func ConvertItems(items []parser.Item, feed storage.Feed) []storage.Item {
	result := make([]storage.Item, len(items))
	for i, item := range items {
		// Copy the loop variable: its address is taken below via
		// &item.AudioURL/&item.ImageURL, and the iteration variable
		// is reused on every pass (prior to Go 1.22).
		item := item
		var audioURL *string = nil
		if item.AudioURL != "" {
			audioURL = &item.AudioURL
		}
		var imageURL *string = nil
		if item.ImageURL != "" {
			imageURL = &item.ImageURL
		}
		result[i] = storage.Item{
			GUID:     item.GUID,
			FeedId:   feed.Id,
			Title:    item.Title,
			Link:     item.URL,
			Content:  item.Content,
			Date:     item.Date,
			Status:   storage.UNREAD,
			ImageURL: imageURL,
			AudioURL: audioURL,
		}
	}
	return result
}
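
// listItems performs a conditional GET on the feed URL using the stored
// Last-Modified/ETag validators. It returns (nil, nil) when the server
// reports the feed unchanged, and the converted items otherwise.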
func listItems(f storage.Feed, db *storage.Storage) ([]storage.Item, error) {
	lmod := ""
	etag := ""
	if state := db.GetHTTPState(f.Id); state != nil {
		lmod = state.LastModified
		etag = state.Etag
	}

	res, err := client.getConditional(f.FeedLink, lmod, etag)
	if err != nil {
		return nil, fmt.Errorf("unable to get: %s", err)
	}
	defer res.Body.Close()

	switch {
	case res.StatusCode < 200 || res.StatusCode > 399:
		return nil, fmt.Errorf("status code %d", res.StatusCode)
	case res.StatusCode == http.StatusNotModified:
		// The cached state is still current; nothing to convert.
		return nil, nil
	}

	body, err := charset.NewReader(res.Body, res.Header.Get("Content-Type"))
	if err != nil {
		return nil, fmt.Errorf("failed to init response body: %s", err)
	}
	feed, err := parser.Parse(body)
	if err != nil {
		return nil, fmt.Errorf("failed to parse: %s", err)
	}

	// Remember the validators so the next poll can be conditional.
	lmod = res.Header.Get("Last-Modified")
	etag = res.Header.Get("Etag")
	if lmod != "" || etag != "" {
		db.SetHTTPState(f.Id, lmod, etag)
	}

	feed.TranslateURLs(f.FeedLink)
	feed.SetMissingDatesTo(time.Now())
	return ConvertItems(feed.Items, f), nil
}