Skip to content
Snippets Groups Projects
Commit 0209b248 authored by Adam Harrison-Fuller's avatar Adam Harrison-Fuller
Browse files

Added duplication checking from Gitlab


Added duplication detection by adding the RSS item's GUID to the description of the created issue. We run a search before we create each new issue to make sure that we don't already have an issue that contains the GUID.

Signed-off-by: default avatarAdam Harrison-Fuller <adam@adamhf.io>
parent 6bd8187e
Branches
Tags
No related merge requests found
......@@ -2,6 +2,19 @@
Create Gitlab issues from RSS Feeds with optional labelling. Created to monitor RSS feeds and bring posts to
our attention (Security Releases, Product Updates etc)
## Avoiding Duplication
We try to be as clever as is reasonably possible in terms of not duplicating RSS feed items into Gitlab.
A SQLite DB is used to store the GUID/FeedID combination which is checked when assessing articles for synchronisation.
In addition, we also add the RSS item's GUID at the bottom of the issue description. Before synchronising an RSS item
we run an issue search in the associated project; if we don't find the GUID in any issue we assume it has not already been created.
This helps to guard against scenarios where you lose the SQLite DB and don't want RSS items being duplicated into Gitlab.
If found in Gitlab, the item is marked as synchronised in the local database and a link to the existing issue(s) is printed to stdout.
## Limiting what is initially synced.
Each feed entry in the config file can have an "added_since" property set. This is used to only sync RSS items that have a
Published/Updated date greater than the provided value. This can be useful on RSS feeds where you don't want to import historic items,
just new posts going forward.
## Config file
The config file **MUST** be named config.yaml, an example one is provided [here](config.yaml.example). Below is a brief
......@@ -35,7 +48,7 @@ Make sure the location of your DATA_DIR environment variable is set to a persist
that is contained within it stores the state of which RSS items have already been synced.
### Run it
```sh
```bash
docker run -e GITLAB_API_TOKEN=<INSERT_TOKEN> -e DATA_DIR=/data -e CONFIG_DIR=/app -v <PATH_TO_DATA_DIR>:/data -v <PATH_TO_CONFIG_DIR>/config adamhf/rss-sync:latest
```
......
......@@ -15,6 +15,7 @@ import (
"net/http"
"os"
"path"
"strings"
"time"
)
......@@ -48,6 +49,33 @@ type EnvValues struct {
GitlabAPIKey string
}
// hasExistingGitlabIssue reports whether an issue search for guid within the
// project identified by projectID returns at least one issue.
//
// Only the first page of results (up to 10 issues) is requested. The web URLs
// of the issues found are printed to stdout so they can be inspected.
//
// If the search itself fails, the error is printed and false is returned, so
// the lookup stays best-effort and a Gitlab outage is not mistaken for an
// existing issue.
func hasExistingGitlabIssue(guid string, projectID int, gitlabClient *gitlab.Client) bool {
	searchOptions := &gitlab.SearchOptions{
		Page:    1,
		PerPage: 10,
	}
	issues, _, err := gitlabClient.Search.IssuesByProject(projectID, guid, searchOptions)
	if err != nil {
		// Include the underlying error so a failed lookup can be diagnosed.
		fmt.Printf("Unable to query Gitlab for existing issues: %v\n", err)
		return false
	}

	switch len(issues) {
	case 0:
		return false
	case 1:
		fmt.Printf("Found existing issues for %s in project (%s)\n", guid, issues[0].WebURL)
	default:
		urls := make([]string, 0, len(issues))
		for _, issue := range issues {
			urls = append(urls, issue.WebURL)
		}
		fmt.Printf("Found multiple existing issues for %s in project (%s)\n", guid, strings.Join(urls, ", "))
	}
	return true
}
func (feed Feed) checkFeed(db *gorm.DB, gitlabClient *gitlab.Client) {
fp := gofeed.NewParser()
rss, err := fp.ParseURL(feed.FeedURL)
......@@ -73,14 +101,6 @@ func (feed Feed) checkFeed(db *gorm.DB, gitlabClient *gitlab.Client) {
fmt.Printf("New Items: %d\n", len(newArticle))
for _, item := range newArticle {
// Prefer description over content
var body string
if item.Description != "" {
body = item.Description
} else {
body = item.Content
}
var time *time.Time
// Prefer updated time to published
if item.UpdatedParsed != nil {
......@@ -94,9 +114,24 @@ func (feed Feed) checkFeed(db *gorm.DB, gitlabClient *gitlab.Client) {
continue
}
// Check Gitlab to see if we already have a matching issue there
if hasExistingGitlabIssue(item.GUID, feed.GitlabProjectID, gitlabClient) {
// We think its new but there is already a matching GUID in Gitlab. Mark as Sync'd
db.Create(&SyncedItems{UUID: item.GUID, Feed: feed.ID})
continue
}
// Prefer description over content
var body string
if item.Description != "" {
body = item.Description
} else {
body = item.Content
}
issueOptions := &gitlab.CreateIssueOptions{
Title: gitlab.String(item.Title),
Description: gitlab.String(body),
Description: gitlab.String(body + "\n" + item.GUID),
Labels: feed.Labels,
CreatedAt: time,
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment