From 38e964c5225c621778d9c48ee8e4b46ae156fce5 Mon Sep 17 00:00:00 2001 From: balki <3070606-balki@users.noreply.gitlab.com> Date: Sun, 1 May 2022 15:32:41 -0400 Subject: [PATCH] almost done --- app/app.go | 47 +++++++++++++++++++++--- app/config.go | 5 +++ app/db.go | 97 +++++++++++++++++++++++++++++++++++++++++++++++++ app/download.go | 4 -- app/parser.go | 28 ++++++++++++++ exp/csv/csv.go | 0 exp/csv/main.go | 41 +++++++++++++++++++++ 7 files changed, 213 insertions(+), 9 deletions(-) create mode 100644 app/db.go create mode 100644 app/parser.go create mode 100644 exp/csv/csv.go create mode 100644 exp/csv/main.go diff --git a/app/app.go b/app/app.go index 2b3a01e..be0fb06 100644 --- a/app/app.go +++ b/app/app.go @@ -1,6 +1,11 @@ package app -import "go.balki.me/tss/log" +import ( + "io" + + "go.balki.me/tss/log" + "go.balki.me/tss/proxy" +) func Run(configPath string) { cfg, err := ParseConfig(configPath) @@ -19,7 +24,7 @@ func Run(configPath string) { for _, feed := range cfg.Feeds { log.Info("processing feed", "feed", feed.Name) - ProcessFeed(feed, scheduler) + ProcessFeed(feed, scheduler, cfg.DbDir) } /* @@ -53,19 +58,51 @@ func Run(configPath string) { */ } -func ProcessFeed(feed FeedCfg, scheduler *Scheduler) { +func ProcessFeed(feed FeedCfg, scheduler *Scheduler, dbDir string) { sd, err := scheduler.ShouldDownload(feed.Name, feed.Cron) if err != nil { log.Error("shouldDownload failed", "feed", feed.Name, "err", err) return } + if !sd { log.Info("skipping feed due to schedule", "feed", feed.Name) return } - _, err = Download(feed.Url, feed.Proxy) + + db, err := NewDB(dbDir, feed.Name) if err != nil { - log.Error("download failed", "feed", feed.Name, "url", feed.Url, "proxy", feed.Proxy) + log.Error("failed to get db", "feed", feed.Name, "db_dir", dbDir, "error", err) return } + + data, err := Download(feed.Url, feed.Proxy) + if err != nil { + log.Error("download failed", "feed", feed.Name, "url", feed.Url, "proxy", feed.Proxy, "error", err) + return + } + + entries, err := ParseFeed(data) + if err != nil { + log.Error("feed parsing failed", "feed", feed.Name, "data", data, "error", err) + return + } + + _, err = db.Filter(entries) + if err != nil { + log.Error("failed to filter entries", "feed", feed.Name, "error", err) + } +} + +func Download(url string, proxyUrl string) ([]byte, error) { + client, err := proxy.GetClient(proxyUrl) + if err != nil { + return nil, err + } + res, err := client.Get(url) + if err != nil { + return nil, err + } + defer res.Body.Close() + return io.ReadAll(res.Body) } diff --git a/app/config.go b/app/config.go index ba02325..4aa1889 100644 --- a/app/config.go +++ b/app/config.go @@ -19,6 +19,7 @@ type FeedCfg struct { type Config struct { DataDir string `yaml:"data_dir"` LastSuccessPath string `yaml:"last_loaded_path"` + DbDir string `yaml:"db_dir"` Feeds []FeedCfg `yaml:"feeds"` } @@ -44,5 +45,9 @@ func ParseConfig(configPath string) (*Config, error) { c.LastSuccessPath = path.Join(c.DataDir, "last_success.yml") } + if c.DbDir == "" { + c.DbDir = path.Join(c.DataDir, "feed_data") + } + return &c, nil } diff --git a/app/db.go b/app/db.go new file mode 100644 index 0000000..7575ed7 --- /dev/null +++ b/app/db.go @@ -0,0 +1,97 @@ +package app + +import ( + "encoding/csv" + "fmt" + "os" + "path" + "time" +) + +type Status string + +const ( + Sent Status = "SENT" + Filtered = "FILTERED" + Error = "ERROR" +) + +//default format used by yaml.Marshal +const TimeFormat string = "2006-01-02T15:04:05.999999999-07:00" + +type Record struct { + Time time.Time + Status Status + FeedEntry FeedEntry +} + +type DB interface { + Filter(entries []FeedEntry) ([]FeedEntry, error) + Save([]Record) error +} + +type db struct { + dbPath string + seenLinks map[string]struct{} +} + +func NewDB(storageDir, feedName string) (DB, error) { + dbPath := path.Join(storageDir, fmt.Sprintf("%s.csv", feedName)) + f, err := os.Open(dbPath) + if err != nil { + return nil, err + } + defer f.Close() + reader := csv.NewReader(f) + records, err := reader.ReadAll() + if err != nil { + return nil, fmt.Errorf("failed to parse csv, path:%v, error:%w", dbPath, err) + } + db := db{dbPath: dbPath} + db.seenLinks = map[string]struct{}{} + for _, rec := range records { + var recStatus Status = Status(rec[2]) + if recStatus == Sent || recStatus == Filtered { + db.seenLinks[rec[1]] = struct{}{} + } + } + return &db, nil +} + +func (d *db) Filter(entries []FeedEntry) ([]FeedEntry, error) { + var filteredEntries []FeedEntry + for _, entry := range entries { + if _, ok := d.seenLinks[entry.Link]; !ok { + filteredEntries = append(filteredEntries, entry) + } + } + return filteredEntries, nil +} + +func (d *db) Save(records []Record) error { + f, err := os.OpenFile(d.dbPath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return err + } + defer f.Close() + csvw := csv.NewWriter(f) + if len(d.seenLinks) == 0 { //New file, write header + csvw.Write([]string{ + "Date", + "Link", + "Status", + "FilteredBy", + "Content", + }) + } + for _, r := range records { + csvw.Write([]string{ + r.Time.Format(TimeFormat), + r.FeedEntry.Link, + string(r.Status), + "-", + r.FeedEntry.Content, + }) + } + return nil +} diff --git a/app/download.go b/app/download.go index 2e6c30a..4879f7a 100644 --- a/app/download.go +++ b/app/download.go @@ -1,5 +1 @@ package app - -func Download(url string, proxy string) ([]byte, error) { - return nil, nil -} diff --git a/app/parser.go b/app/parser.go new file mode 100644 index 0000000..42fab7f --- /dev/null +++ b/app/parser.go @@ -0,0 +1,28 @@ +package app + +import ( + "encoding/xml" +) + +type FeedEntry struct { + Title string `xml:"title"` + Link string `xml:"link"` + Author string `xml:"author"` + Guid string `xml:"guid"` + Description string `xml:"description"` + Content string `xml:",innerxml"` +} + +func ParseFeed(data []byte) ([]FeedEntry, error) { + + v := struct { + Items []FeedEntry `xml:"channel>item"` + }{} + + err := xml.Unmarshal(data, &v) + if err != nil { + return nil, err + } + + return v.Items, nil +} diff --git a/exp/csv/csv.go b/exp/csv/csv.go new file mode 100644 index 0000000..e69de29 diff --git a/exp/csv/main.go b/exp/csv/main.go new file mode 100644 index 0000000..44e1a69 --- /dev/null +++ b/exp/csv/main.go @@ -0,0 +1,41 @@ +package main + +import ( + "bytes" + "encoding/csv" + "fmt" + "time" + + "gopkg.in/yaml.v3" +) + +func main() { + fmt.Println("vim-go") + fb := bytes.NewReader(nil) + cr := csv.NewReader(fb) + records, err := cr.ReadAll() + fmt.Println(records, err) + fmt.Println(time.Now().String()) + yesterday := time.Now().Add(-24 * time.Hour) + m := map[string]time.Time{ + "Bala": time.Now(), + "Linus": yesterday, + } + data, _ := yaml.Marshal(&m) + fmt.Printf("%s\n", data) + //format := "2022-05-01T15:08:20.593630746-04:00" + format := "2006-01-02T15:04:05.999999999-07:00" + fmt.Println("============") + fmt.Println(yesterday.Format(format)) + fmt.Println("============") + fmt.Println(yesterday.GoString()) +} + +/* + +Linus: + +============ +2022-04-30T15:14:40.302916106-04:00 +2022-04-30T15:14:40.302916106-04:00 +*/