Categorygithub.com/janeLi-coder1/rock_data
modulepackage
0.0.0-20210303055106-71da478098c1
Repository: https://github.com/janeli-coder1/rock_data.git
Documentation: pkg.go.dev

# README

rock_data

Process rock song play data and import it to postgreSQL DB.

design

1 goroutine for reading from csv file, and specified write goroutines to process rows and insert them to db.

design-structure

build

go mod tidy
go build -o ./rock_data .

run

// clean table and import csv file
$ ./rock_data -file ./data.csv -clean 1

// import csv file without clean table
$ ./rock_data -file ./data.csv

// help
$ ./rock_data -h

configuration

  • config.toml
  1. pipelineNumber - the number of pipeline, define the number of write goroutines.
  2. pipeCapacity - the capacity of each pipeline.
  3. debugSwitch - the switch to print detail log.
  4. database.maxMultiInsertNumber - I use multi-insert to improve efficiency, this field controls max multi-insert number.

postgreSQL schema

CREATE USER jane WITH PASSWORD 'asdf';

CREATE DATABASE classic_rock WITH ENCODING 'UTF8' OWNER='jane';

\c classic_rock;
SET ROLE jane;

Create schema data;
Create table data.play_record (
    id serial primary key,
    song_raw text not null default '',
    song_clean text not null default '',
    artist_raw text not null default '',
    artist_clean text not null default '',
    callsign text not null default '',
    time timestamp with time zone not null default now(),
    unique_id text not null unique default '',
    combined text not null default '',
    is_first bool not null default false,
    create_time timestamp with time zone not null default now(),
    update_time timestamp with time zone not null default now()
);
Comment on column data.play_record.id is 'record id';
Comment on column data.play_record.song_raw is 'song text from radio station';
Comment on column data.play_record.song_clean is 'song title';
Comment on column data.play_record.artist_raw is 'artist text from radio station';
Comment on column data.play_record.artist_clean is 'artist';
Comment on column data.play_record.callsign is 'station callsign';
Comment on column data.play_record.time is 'The time when song is pulled';
Comment on column data.play_record.unique_id is 'unique id';
Comment on column data.play_record.combined is 'unique song text';
Comment on column data.play_record.is_first is 'is first mention of the song';

CREATE OR REPLACE FUNCTION update_timestamp()
RETURNS TRIGGER 
LANGUAGE plpgsql
AS 
$$
BEGIN
    NEW.update_time = now();
    RETURN NEW;
END;
$$;
CREATE TRIGGER trg_update_play_record_timestamp BEFORE UPDATE ON data.play_record FOR EACH ROW EXECUTE PROCEDURE update_timestamp();

# Functions

format each row to array.
No description provided by the author
No description provided by the author
No description provided by the author
No description provided by the author
format one row to struct.
read file path from command line argument.
No description provided by the author
No description provided by the author
No description provided by the author
No description provided by the author
read file content and push to corresponding pipeline.
No description provided by the author
if read goroutine done, check channel length and close it.

# Variables

No description provided by the author
No description provided by the author
No description provided by the author

# Structs

No description provided by the author
No description provided by the author
No description provided by the author
No description provided by the author