I am dealing with a bit over 15 billion rows of data spread across various text files, and I am trying to insert them into MariaDB using Go. Go is a fast language and is often used for big data, but I cannot get more than 10k-15k inserts per second. At this rate the import is going to take over 15 days, and I need the data imported sooner than that. I have tried various batch sizes, but they all give about the same results.
function I'm using to handle file data:
// handlePath reads the file at path line by line and passes the lines to
// insertRows in batches of at most line_batch lines.
//
// Failures to open or scan the file are reported on stdout and end the call;
// nothing is returned to the caller. The "Formatted" message is printed only
// when the whole file was scanned without error.
func handlePath(path string) {
	file, err := os.Open(path)
	if err != nil {
		fmt.Printf("error opening %v: %v", path, err)
		return
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	var temp_lines []string
	for scanner.Scan() {
		if len(temp_lines) == line_batch {
			insertRows(temp_lines)
			// insertRows runs synchronously and does not keep the slice, so
			// the backing array can be reused for the next batch.
			temp_lines = temp_lines[:0]
		}
		temp_lines = append(temp_lines, scanner.Text())
	}

	// Flush the final partial batch. Skip it when empty (empty file, or the
	// scan failed before the first line): an empty batch has nothing to insert.
	if len(temp_lines) > 0 {
		insertRows(temp_lines)
	}

	// Check the scanner before reporting success; a scan error (for example
	// bufio.ErrTooLong on an over-long line) means the file was only partly read.
	if err := scanner.Err(); err != nil {
		fmt.Printf("\nScanner error %v\n", err)
		return
	}
	fmt.Printf("\nFormatted %v\n", path)
}
function I'm using for inserting:
// insertRows bulk-inserts pipe-delimited records into new_table.
//
// Each row must have exactly six '|'-separated fields, in the order
// database_id, email, password, username, ip, phone. Rows with a different
// field count are skipped individually instead of aborting the whole batch.
//
// Values are sent as statement parameters rather than being spliced into the
// SQL text, so quotes or other special characters in the data can neither
// break the statement nor inject SQL. Because a MySQL/MariaDB prepared
// statement accepts at most 65535 placeholders, large batches are split into
// several statements.
//
// On success the package-level counters total and writes are updated; on
// failure the error is printed and the counters are left untouched.
func insertRows(rows []string) {
	const (
		fieldsPerRow   = 6
		maxRowsPerStmt = 65535 / fieldsPerRow // placeholder limit per statement
		rowPlaceholder = "(?,?,?,?,?,?)"
		insertPrefix   = "INSERT INTO new_table (database_id, email, password, username, ip, phone_number) VALUES "
	)

	args := make([]interface{}, 0, len(rows)*fieldsPerRow)
	for _, row := range rows {
		fields := strings.Split(row, "|")
		if len(fields) != fieldsPerRow {
			continue // malformed line: drop only this row
		}
		for _, field := range fields {
			args = append(args, field)
		}
	}
	if len(args) == 0 {
		return // nothing valid to insert; "VALUES" with no tuples is invalid SQL
	}

	const maxArgsPerStmt = maxRowsPerStmt * fieldsPerRow
	for start := 0; start < len(args); start += maxArgsPerStmt {
		end := start + maxArgsPerStmt
		if end > len(args) {
			end = len(args)
		}
		rowCount := (end - start) / fieldsPerRow
		placeholders := strings.TrimSuffix(strings.Repeat(rowPlaceholder+",", rowCount), ",")
		if _, err := db.Exec(insertPrefix+placeholders, args[start:end]...); err != nil {
			fmt.Printf("insert failed: %v\n", err)
			return
		}
	}

	// NOTE(review): total is advanced by line_batch as before, which overstates
	// the count for a final partial batch or when rows were skipped; the
	// counter's type is not visible here, so the increment is left unchanged.
	total += line_batch
	writes++
}
Server specs:
server
I am currently in the process of creating a little Go App. Right now I am working on the DB part. The Library I use is this one: https://github.com/jackc/pgx
The problem I have is that every time I try to execute the database read, it tells me that my 'conn is busy'. I read about using a pgxpool instead of a single connection, but it still does not work. What am I doing wrong?
// read runs query and returns the single resulting row.
//
// The previous version acquired a connection, opened a transaction, and
// committed it before the caller had scanned the returned row; the row was
// still pending on the connection at commit time, which is what produced
// "conn busy". It also overwrote the Acquire error and never released the
// connection.
//
// A single read statement does not need an explicit transaction, so the query
// is issued straight through the pool.
// NOTE(review): this assumes postgre.client is a *pgxpool.Pool, whose QueryRow
// hands the connection back to the pool when the row's Scan is called — confirm.
//
// The returned error is always nil; query errors surface from Scan on the
// returned row.
func (postgre *PostgreClient) read(query string) (pgx.Row, error) {
	return postgre.client.QueryRow(context.TODO(), query), nil
}
Thanks in advance.
You have to scan the row before you commit the transaction.
If you want the handling of the transaction to remain within the function you can pass an interface that does the scanning also inside the function.
For example:
// Row is the minimal scanning surface shared by single-row and multi-row
// results: anything with a Scan method that copies the current row's columns
// into dst.
// implemented by *sql.Row & *sql.Rows
// NOTE(review): the surrounding code uses pgx, where the matching types are
// presumably pgx.Row and pgx.Rows — confirm.
type Row interface {
Scan(dst ...interface{}) error
}
// RowScanner is implemented by your "models": a type that knows how to
// populate itself from a Row by calling r.Scan with its own fields.
type RowScanner interface {
ScanRow(r Row) error
}
// User is an example model with two fields, filled in by ScanRow from a row
// whose columns are (id, email) in that order.
type User struct {
Id int
Email string
}
// ScanRow fills u from r, reading the id column into u.Id and the email column
// into u.Email, and returns whatever error r.Scan reports.
func (u *User) ScanRow(r Row) error {
	dest := []interface{}{&u.Id, &u.Email}
	return r.Scan(dest...)
}
// read runs query inside a transaction and lets rs scan the single resulting
// row before the transaction is finished, so the connection is never busy when
// the commit happens.
//
// The connection is acquired from postgre.client and released on return. The
// transaction is rolled back if scanning fails and committed otherwise; a
// commit failure is returned to the caller through the named result.
func (postgre *PostgreClient) read(query string, rs RowScanner) (err error) {
	ctx := context.TODO()

	conn, err := postgre.client.Acquire(ctx)
	if err != nil {
		return err
	}
	defer conn.Release()

	tx, err := conn.BeginTx(ctx, pgx.TxOptions{})
	if err != nil {
		return err
	}
	defer func() {
		if err != nil {
			// Best effort: the scan error is the one worth reporting.
			_ = tx.Rollback(ctx)
			return
		}
		err = tx.Commit(ctx)
	}()

	// QueryRow itself reports no error; any query failure surfaces from the
	// Scan call that rs.ScanRow performs.
	return rs.ScanRow(tx.QueryRow(ctx, query))
}
// Usage example: allocate a User and let read populate it through its
// ScanRow method; the example panics if the query or the scan fails.
u := new(User)
if err := pg.read("select id, email from users limit 1", u); err != nil {
panic(err)
}
For scanning a list of models:
// UserList collects pointers to the users scanned from a multi-row result.
type UserList []*User

// ScanRow scans the current row of r into a new User and appends it to the
// list. When the scan fails, the error is returned and the list is unchanged.
func (ul *UserList) ScanRow(r Row) error {
	var scanned User
	err := scanned.ScanRow(r)
	if err != nil {
		return err
	}
	*ul = append(*ul, &scanned)
	return nil
}
// list runs query inside a transaction and calls rs.ScanRow once for every
// returned row.
//
// The connection is acquired from postgre.client and released on return. The
// rows are closed before the transaction is finished (deferred calls run in
// reverse order). The transaction is rolled back when the query, a scan, or
// the row iteration fails, and committed otherwise; a commit failure is
// returned to the caller through the named result instead of being dropped.
func (postgre *PostgreClient) list(query string, rs RowScanner) (err error) {
	ctx := context.TODO()

	conn, err := postgre.client.Acquire(ctx)
	if err != nil {
		return err
	}
	defer conn.Release()

	tx, err := conn.BeginTx(ctx, pgx.TxOptions{})
	if err != nil {
		return err
	}
	defer func() {
		if err != nil {
			// Best effort: the original failure is the one worth reporting.
			_ = tx.Rollback(ctx)
			return
		}
		err = tx.Commit(ctx)
	}()

	rows, err := tx.Query(ctx, query)
	if err != nil {
		return err
	}
	defer rows.Close()

	for rows.Next() {
		if scanErr := rs.ScanRow(rows); scanErr != nil {
			return scanErr
		}
	}
	return rows.Err()
}
// Usage example: allocate an empty UserList and let list append one User per
// returned row; the example panics if the query or any scan fails.
ul := new(UserList)
if err := pg.list("select id, email from users", ul); err != nil {
panic(err)
}
I have a test suite that pollutes my database using a seed read from a YAML file.
I'm wondering is there a way to clean my database (delete all records used for the test suite) after running my tests.
// prepareMySQLDB opens the MySQL test database and returns the handle together
// with the function that closes it.
//
// sql.Open only validates its arguments and does not connect, so the
// connection is verified with Ping; otherwise an unreachable database would
// only show up later, inside the test body. Any failure aborts the calling
// test, and t.Helper makes the failure point at the caller's line.
func prepareMySQLDB(t *testing.T) (db *sql.DB, closer func() error) {
	t.Helper()

	db, err := sql.Open("mysql", "user:pass#/database")
	if err != nil {
		t.Fatalf("open mysql connection: %s", err)
	}
	if err := db.Ping(); err != nil {
		db.Close()
		t.Fatalf("ping mysql connection: %s", err)
	}
	return db, db.Close
}
// polluteDb seeds db with the records described in seed.yml (opened relative
// to the working directory) using a polluter configured for MySQL.
//
// Any failure aborts the calling test; t.Helper makes the failure point at the
// caller's line rather than at this helper.
func polluteDb(db *sql.DB, t *testing.T) {
	t.Helper()

	seed, err := os.Open("seed.yml")
	if err != nil {
		t.Fatalf("failed to open seed file: %s", err)
	}
	defer seed.Close()

	p := polluter.New(polluter.MySQLEngine(db))
	if err := p.Pollute(seed); err != nil {
		t.Fatalf("failed to pollute: %s", err)
	}
}
// TestAllUsers seeds the database and checks that AllUsersD returns exactly
// the one seeded user with the expected e-mail address.
func TestAllUsers(t *testing.T) {
	t.Parallel()
	db, closeDb := prepareMySQLDB(t)
	defer closeDb()
	polluteDb(db, t)

	users, err := AllUsersD(db)
	if err != nil {
		t.Fatalf("AllUsers() failed: %v", err)
	}

	// Check the length first and stop on mismatch: indexing users[0] on an
	// empty result would panic instead of reporting a test failure.
	if got := len(users); got != 1 {
		t.Fatalf("len(AllUsers()) = %d; want 1", got)
	}
	if got := users[0].Email; got != "myemail#gmail.com" {
		t.Errorf("AllUsers().Email = %s; want myemail#gmail.com", got)
	}
}
// Test I'm interested in
// TestAddUser seeds the database and calls AddUser; the argument list is
// elided ("...") in this snippet, so it is a sketch rather than compilable code.
func TestAddUser(t *testing.T) {
t.Parallel()
db, closeDb := prepareMySQLDB(t)
defer closeDb()
polluteDb(db, t)
// NOTE(review): user is assigned but not used in the visible code.
user, err := AddUser(...)
if err != nil {
t.Fatal("AddUser() failed")
}
//how can I clean my database after this?
}
Should I retrieve the last ID inserted in TestAddUser() and just delete that row manually, or is there another way to save my database state and restore it afterwards?
As I said I'm new to Go so any other comments on my code or what so ever are strongly appreciated.
The best way is usually to use a transaction, then ROLLBACK, so they are never committed in the first place.
The github.com/DATA-DOG/go-txdb package can help a lot with that.
Final code:
import (
"database/sql"
"os"
"testing"
txdb "github.com/DATA-DOG/go-txdb"
"github.com/romanyx/polluter"
)
//mostly sql tests

// init registers a database/sql driver named "txdb" that is backed by the
// "mysql" driver and the given DSN, so tests can sql.Open("txdb", ...).
// NOTE(review): go-txdb is expected to run each opened connection inside a
// transaction that is rolled back on Close — confirm against the package docs.
func init() {
txdb.Register("txdb", "mysql", "root:root#/betell_rest")
}
// TestAddUser checks that AddUser adds exactly one row: it counts the users
// before and after the call on a "txdb" connection.
func TestAddUser(t *testing.T) {
	db, err := sql.Open("txdb", "root:root#/betell_rest")
	if err != nil {
		t.Fatal(err)
	}
	defer db.Close()

	// A failed lookup would otherwise be read as "zero users" and could make
	// the count comparison below pass or fail for the wrong reason.
	users, err := AllUsers(db)
	if err != nil {
		t.Fatalf("AllUsers() before insert failed: %v", err)
	}
	userscount := len(users)

	if err := AddUser(db, "bla#gmail.com", "pass"); err != nil {
		t.Fatalf("AddUser() failed: %v", err)
	}

	users, err = AllUsers(db)
	if err != nil {
		t.Fatalf("AllUsers() after insert failed: %v", err)
	}
	if (userscount + 1) != len(users) {
		t.Fatal("AddUser() failed to write in database")
	}
}
Note: Also you can pass db into your polluter so you don't affect your database at all.
I'm trying to run multiple commands through ssh but seems that Session.Run allows only one command per session ( unless I'm wrong). I'm wondering how can I bypass this limitation and reuse the session or send a sequence of commands.
The reason is that I need to run sudo su within the same session with the next command ( sh /usr/bin/myscript.sh )
Session.Shell allows for more than one command to be run, by passing your commands in via session.StdinPipe().
Be aware that using this approach will make your life more complicated; instead of having a one-shot function call that runs the command and collects the output once it's complete, you'll need to manage your input buffer (don't forget a \n at the end of a command), wait for output to actually come back from the SSH server, then deal with that output appropriately (if you had multiple commands in flight and want to know what output belongs to what input, you'll need to have a plan to figure that out).
// Obtain the session's stdin pipe, then start the remote shell.
// NOTE(review): the StdinPipe error is discarded and err from Shell is never
// checked in this fragment; real code should handle both.
stdinBuf, _ := session.StdinPipe()
err := session.Shell()
stdinBuf.Write([]byte("cd /\n"))
// The command has been sent to the device, but you haven't gotten output back yet.
// Note that nothing stops you from sending more commands immediately.
stdinBuf.Write([]byte("ls\n"))
// Then you'll want to wait for the response, and watch the stdout buffer for output.
While for your specific problem you can simply run sudo /path/to/script.sh, it shocked me that there wasn't a simple way to run multiple commands in the same session, so I came up with a bit of a hack, YMMV:
// MuxShell multiplexes commands over one interactive shell.
//
// Commands sent on the first returned channel are written to w, one per line.
// Everything read from r is accumulated until the output ends with a prompt,
// detected as a '$' in the second-to-last position (i.e. "$ " at the end); the
// accumulated text is then delivered on the second returned channel. The
// writer sends the next command only after the previous one's prompt was seen.
//
// When r returns an error, both channels are closed and the reader stops.
// If the shell's prompt does not end in "$ ", no output is ever delivered.
func MuxShell(w io.Writer, r io.Reader) (chan<- string, <-chan string) {
	in := make(chan string, 1)
	out := make(chan string, 1)
	var wg sync.WaitGroup
	wg.Add(1) // for the shell itself: its first prompt counts as one response

	// Writer: one command at a time, each waiting for its prompt.
	go func() {
		for cmd := range in {
			wg.Add(1)
			w.Write([]byte(cmd + "\n"))
			wg.Wait()
		}
	}()

	// Reader: gather output until a prompt shows up at the very end.
	go func() {
		var (
			chunk   [4096]byte
			pending []byte // grows as needed, so long output cannot fill it up
		)
		for {
			n, err := r.Read(chunk[:])
			if err != nil {
				close(in)
				close(out)
				return
			}
			pending = append(pending, chunk[:n]...)
			// Require at least two bytes before looking at the second-to-last
			// one; a 1-byte first read must not index before the buffer start.
			if len(pending) >= 2 && pending[len(pending)-2] == '$' { //assuming the $PS1 == 'sh-4.3$ '
				out <- string(pending)
				pending = pending[:0]
				wg.Done()
			}
		}
	}()
	return in, out
}
// main connects to a local SSH server with password authentication, starts
// /bin/sh on a pseudo-terminal, and runs a few commands through MuxShell,
// printing each command's output.
func main() {
	config := &ssh.ClientConfig{
		User: "kf5",
		Auth: []ssh.AuthMethod{
			ssh.Password("kf5"),
		},
		// ClientConfig requires a host key callback; without one the
		// connection attempt is rejected. Non-production only: this accepts
		// any host key and therefore offers no protection against
		// man-in-the-middle attacks.
		HostKeyCallback: ssh.InsecureIgnoreHostKey(),
	}
	client, err := ssh.Dial("tcp", "127.0.0.1:22", config)
	if err != nil {
		panic(err)
	}
	defer client.Close()

	session, err := client.NewSession()
	if err != nil {
		log.Fatalf("unable to create session: %s", err)
	}
	defer session.Close()

	modes := ssh.TerminalModes{
		ssh.ECHO:          0,     // disable echoing
		ssh.TTY_OP_ISPEED: 14400, // input speed = 14.4kbaud
		ssh.TTY_OP_OSPEED: 14400, // output speed = 14.4kbaud
	}
	if err := session.RequestPty("xterm", 80, 40, modes); err != nil {
		log.Fatal(err)
	}

	// The pipes are requested before the shell is started.
	w, err := session.StdinPipe()
	if err != nil {
		panic(err)
	}
	r, err := session.StdoutPipe()
	if err != nil {
		panic(err)
	}
	in, out := MuxShell(w, r)
	if err := session.Start("/bin/sh"); err != nil {
		log.Fatal(err)
	}

	<-out //ignore the shell output
	in <- "ls -lhav"
	fmt.Printf("ls output: %s\n", <-out)
	in <- "whoami"
	fmt.Printf("whoami: %s\n", <-out)
	in <- "exit"
	if err := session.Wait(); err != nil {
		log.Printf("session ended with error: %v", err)
	}
}
If your shell prompt doesn't end with "$ " (a dollar sign followed by a space), this will deadlock, which is why it's a hack.
NewSession is a method of a connection. You don't need to create a new connection each time. A Session seems to be what this library calls a channel for the client, and many channels are multiplexed in a single connection. Hence:
// executeCmd runs each command in cmd on hostname (port 8022) over a single
// SSH connection, opening a fresh session per command, and returns the
// hostname followed by the combined standard output of all commands.
//
// A connection or session failure terminates the program via log.Fatal. A
// command that fails to run is logged and the remaining commands still run.
func executeCmd(cmd []string, hostname string, config *ssh.ClientConfig) string {
	conn, err := ssh.Dial("tcp", hostname+":8022", config)
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()

	var stdoutBuf bytes.Buffer
	for _, command := range cmd {
		session, err := conn.NewSession()
		if err != nil {
			log.Fatal(err)
		}
		session.Stdout = &stdoutBuf
		if err := session.Run(command); err != nil {
			log.Printf("%s: running %q: %v", hostname, command, err)
		}
		// Close right away: a defer here would keep every session open until
		// the function returns.
		session.Close()
	}
	return hostname + ": " + stdoutBuf.String()
}
So you open a new session(channel) and you run command within the existing ssh connection but with a new session(channel) each time.
You can use a small trick: sh -c 'cmd1&&cmd2&&cmd3&&cmd4&&etc..'
This is a single command, the actual commands are passed as argument to the shell which will execute them. This is how Docker handles multiple commands.
This works for me.
package main
import (
"fmt"
"golang.org/x/crypto/ssh"
// "io"
"log"
"os"
// Uncomment to store output in variable
//"bytes"
)
// MachineDetails holds what connectHost needs to reach one SSH host: the
// login name and password, plus the hostname and port that are joined into
// the dial address. All four fields are strings.
type MachineDetails struct {
username, password, hostname, port string
}
// main describes one host and a list of shell commands, then hands both to
// connectHost, which streams the remote output to this process's stdout/stderr.
func main() {
h1 := MachineDetails{"root", "xxxxx", "x.x.x.x", "22"}
// Uncomment to store output in variable
// NOTE(review): the commented-out lines refer to sess, which is only in scope
// inside connectHost; they would have to be moved there to compile.
//var b bytes.Buffer
//sess.Stdout = &b
//sess.Stderr = &b
// The final "exit" ends the remote shell so that connectHost's Wait returns.
commands := []string{
"pwd",
"whoami",
"echo 'bye'",
"exit",
}
connectHost(h1, commands)
// Uncomment to store in variable
//fmt.Println(b.String())
}
// connectHost logs in to the host described by hostParams with password
// authentication, starts a remote shell, feeds it every entry of commands on
// its own line, and waits for the shell to finish. Remote stdout and stderr
// are forwarded to this process's stdout and stderr. Any failure terminates
// the program via log.Fatal.
func connectHost(hostParams MachineDetails, commands []string) {
	address := hostParams.hostname + ":" + hostParams.port
	clientConfig := &ssh.ClientConfig{
		User: hostParams.username,
		Auth: []ssh.AuthMethod{ssh.Password(hostParams.password)},
		// Non-production only
		HostKeyCallback: ssh.InsecureIgnoreHostKey(),
	}

	// Dial the host and make sure the connection is closed on return.
	conn, dialErr := ssh.Dial("tcp", address, clientConfig)
	if dialErr != nil {
		log.Fatal(dialErr)
	}
	defer conn.Close()

	// One session carries the whole interactive shell.
	shell, sessErr := conn.NewSession()
	if sessErr != nil {
		log.Fatal("Failed to create session: ", sessErr)
	}
	defer shell.Close()

	// Forward remote output to the local terminal.
	// Point these at a buffer instead to capture the output in a variable.
	shell.Stdout = os.Stdout
	shell.Stderr = os.Stderr

	// The commands are delivered through the shell's stdin.
	input, pipeErr := shell.StdinPipe()
	if pipeErr != nil {
		log.Fatal(pipeErr)
	}

	if startErr := shell.Shell(); startErr != nil {
		log.Fatal(startErr)
	}

	for i := range commands {
		if _, writeErr := fmt.Fprintf(input, "%s\n", commands[i]); writeErr != nil {
			log.Fatal(writeErr)
		}
	}

	// Block until the remote shell exits.
	if waitErr := shell.Wait(); waitErr != nil {
		log.Fatal(waitErr)
	}
}
// createSession is an empty placeholder: it has no body, and nothing in the
// visible code calls it.
func createSession() {
}
I really liked OneOfOne's answer, which inspired a more generalized solution: it takes a variable byte sequence, matches it against the tail of the bytes read so far, and stops the blocking read once it matches (there is also no need to start two extra goroutines for the blocking reads and writes). The known limitation (as in the original solution) is that if the matching string comes after 64 * 1024 bytes, this code will spin forever.
package main
import (
"fmt"
"golang.org/x/crypto/ssh"
"io"
"log"
)
// escapePrompt is the two-byte sequence '$', ' ' that main passes to readUntil
// as the marker for the end of a command's output (the tail of the shell prompt).
var escapePrompt = []byte{'$', ' '}
// main connects to a local SSH server with password authentication, starts
// /bin/sh on a pseudo-terminal, and runs a few commands, using readUntil to
// collect each command's output up to the next shell prompt.
func main() {
	config := &ssh.ClientConfig{
		User: "dummy",
		Auth: []ssh.AuthMethod{
			ssh.Password("dummy"),
		},
		HostKeyCallback: ssh.InsecureIgnoreHostKey(),
	}
	client, err := ssh.Dial("tcp", "127.0.0.1:22", config)
	if err != nil {
		panic(err)
	}
	defer client.Close()

	session, err := client.NewSession()
	if err != nil {
		log.Fatalf("unable to create session: %s", err)
	}
	defer session.Close()

	modes := ssh.TerminalModes{
		ssh.ECHO:          0,     // disable echoing
		ssh.TTY_OP_ISPEED: 14400, // input speed = 14.4kbaud
		ssh.TTY_OP_OSPEED: 14400, // output speed = 14.4kbaud
	}
	if err := session.RequestPty("xterm", 80, 40, modes); err != nil {
		log.Fatal(err)
	}

	w, err := session.StdinPipe()
	if err != nil {
		panic(err)
	}
	r, err := session.StdoutPipe()
	if err != nil {
		panic(err)
	}
	if err := session.Start("/bin/sh"); err != nil {
		log.Fatal(err)
	}

	// Every readUntil result is checked before use: on failure it returns a
	// nil *string, and dereferencing that would panic.
	if _, err := readUntil(r, escapePrompt); err != nil { //ignore the shell output
		log.Fatal(err)
	}

	if err := write(w, "ls -lhav"); err != nil {
		log.Fatal(err)
	}
	out, err := readUntil(r, escapePrompt)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("ls output: %s\n", *out)

	if err := write(w, "whoami"); err != nil {
		log.Fatal(err)
	}
	out, err = readUntil(r, escapePrompt)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("whoami: %s\n", *out)

	if err := write(w, "exit"); err != nil {
		log.Fatal(err)
	}
	if err := session.Wait(); err != nil {
		log.Printf("session ended with error: %v", err)
	}
}
// write sends command to w followed by a newline, in a single Write call, and
// reports the error from that call (nil on success).
func write(w io.WriteCloser, command string) error {
	line := []byte(command + "\n")
	if _, err := w.Write(line); err != nil {
		return err
	}
	return nil
}
// readUntil reads from r until the accumulated data ends with matchingByte
// and returns everything read so far (marker included) as a string.
//
// The data is collected in a buffer that grows as needed, so output of any
// length is handled; a fixed-size buffer would leave no room for further
// reads once full and make the loop spin without progress. Bytes delivered
// together with a read error are still examined before the error is
// reported. If r fails before the marker is seen, nil and the error are
// returned.
func readUntil(r io.Reader, matchingByte []byte) (*string, error) {
	var (
		chunk    [4096]byte
		received []byte
	)
	for {
		n, err := r.Read(chunk[:])
		received = append(received, chunk[:n]...)
		if isMatch(received, len(received), matchingByte) {
			stringResult := string(received)
			return &stringResult, nil
		}
		if err != nil {
			return nil, err
		}
	}
}

// isMatch reports whether the first t bytes of bytes end with matchingBytes.
// It returns false when t is smaller than len(matchingBytes).
func isMatch(bytes []byte, t int, matchingBytes []byte) bool {
	if t < len(matchingBytes) {
		return false
	}
	tail := bytes[t-len(matchingBytes) : t]
	for i, want := range matchingBytes {
		if tail[i] != want {
			return false
		}
	}
	return true
}
get inspiration from this
I spent several days on this, and that answer inspired me to try using stdin to run multiple commands; I finally succeeded. I should say that I don't know Go at all, so the code may be redundant, but it works.
if _, err := w.Write([]byte("sys\r")); err != nil {
panic("Failed to run: " + err.Error())
}
if _, err := w.Write([]byte("wlan\r")); err != nil {
panic("Failed to run: " + err.Error())
}
if _, err := w.Write([]byte("ap-id 2099\r")); err != nil {
panic("Failed to run: " + err.Error())
}
if _, err := w.Write([]byte("ap-group xuebao-free\r")); err != nil {
panic("Failed to run: " + err.Error())
}
if _, err := w.Write([]byte("y\r")); err != nil {
panic("Failed to run: " + err.Error())
}
Its effect is the same as typing the commands in a terminal.
here is the whole code:
/* switch ssh
*/
package main
import (
"flag"
"fmt"
"io"
"log"
"net"
"os"
"strings"
"sync"
)
import (
"golang.org/x/crypto/ssh"
)
// main logs in to a network device over SSH, runs the command given by --cmd
// through MuxShell and prints its output, then sends a fixed sequence of
// configuration commands directly to the shell's stdin and finally "quit".
//
// Connection parameters come from the --username, --passwd, --ip_port and
// --cmd flags.
func main() {
	//go run ./testConfig.go --username="aaa" --passwd='aaa' --ip_port="192.168.6.87" --cmd='display version'
	username := flag.String("username", "aaa", "username")
	passwd := flag.String("passwd", "aaa", "password")
	ip_port := flag.String("ip_port", "1.1.1.1:22", "ip and port")
	cmdstring := flag.String("cmd", "display arp statistics all", "cmdstring")
	flag.Parse()
	fmt.Println("username:", *username)
	fmt.Println("passwd:", *passwd)
	fmt.Println("ip_port:", *ip_port)
	fmt.Println("cmdstring:", *cmdstring)

	config := &ssh.ClientConfig{
		User: *username,
		Auth: []ssh.AuthMethod{
			ssh.Password(*passwd),
		},
		Config: ssh.Config{
			Ciphers: []string{"aes128-cbc", "aes128-ctr"},
		},
		// Accepts any host key: no protection against man-in-the-middle.
		HostKeyCallback: func(hostname string, remote net.Addr, key ssh.PublicKey) error {
			return nil
		},
	}
	// config.Config.Ciphers = append(config.Config.Ciphers, "aes128-cbc")

	clinet, err := ssh.Dial("tcp", *ip_port, config)
	checkError(err, "connet "+*ip_port)
	defer clinet.Close()

	// Check the error before registering the deferred Close, so Close is only
	// ever scheduled on a valid session.
	session, err := clinet.NewSession()
	checkError(err, "creae shell")
	defer session.Close()

	modes := ssh.TerminalModes{
		ssh.ECHO:          1,     // enable echoing (0 would disable it)
		ssh.TTY_OP_ISPEED: 14400, // input speed = 14.4kbaud
		ssh.TTY_OP_OSPEED: 14400, // output speed = 14.4kbaud
	}
	if err := session.RequestPty("vt100", 80, 40, modes); err != nil {
		log.Fatal(err)
	}

	w, err := session.StdinPipe()
	if err != nil {
		panic(err)
	}
	r, err := session.StdoutPipe()
	if err != nil {
		panic(err)
	}
	e, err := session.StderrPipe()
	if err != nil {
		panic(err)
	}

	in, out := MuxShell(w, r, e)
	if err := session.Shell(); err != nil {
		log.Fatal(err)
	}

	<-out //ignore the shell output
	in <- *cmdstring
	fmt.Printf("%s\n", <-out)

	// These commands bypass MuxShell and go straight to the shell's stdin,
	// each terminated by a carriage return.
	for _, step := range []string{"sys", "wlan", "ap-id 2099", "ap-group xuebao-free", "y"} {
		if _, err := w.Write([]byte(step + "\r")); err != nil {
			panic("Failed to run: " + err.Error())
		}
	}

	in <- "quit"
	<-out
	session.Wait()
}
// checkError does nothing when err is nil. Otherwise it prints info together
// with the error on stdout and terminates the process with exit status 1.
func checkError(err error, info string) {
	if err == nil {
		return
	}
	fmt.Printf("%s. error: %s\n", info, err)
	os.Exit(1)
}
// MuxShell multiplexes commands over one interactive device shell.
//
// Commands sent on the first returned channel are written to w, one per line.
// Everything read from r is accumulated until it contains "username:",
// "password:" or ">"; the accumulated text is then delivered on the second
// returned channel. The writer sends the next command only after the previous
// one's output was delivered. Whenever a freshly read chunk contains "More"
// (a pager prompt), a newline is written to w to continue the output.
//
// When r returns an error, the error is printed, both channels are closed and
// the reader stops. The e reader is accepted but not read by this function.
func MuxShell(w io.Writer, r, e io.Reader) (chan<- string, <-chan string) {
	in := make(chan string, 5)
	out := make(chan string, 5)
	var wg sync.WaitGroup
	wg.Add(1) // for the shell itself: its first prompt counts as one response

	// Writer: one command at a time, each waiting for its response.
	go func() {
		for cmd := range in {
			wg.Add(1)
			w.Write([]byte(cmd + "\n"))
			wg.Wait()
		}
	}()

	// Reader: gather output until one of the markers appears.
	go func() {
		// Longest marker length minus one: how far a marker can reach back
		// into previously scanned data when it straddles two reads.
		const overlap = len("username:") - 1
		var (
			chunk   [32 * 1024]byte
			pending []byte // grows as needed, so long output cannot fill it up
		)
		for {
			n, err := r.Read(chunk[:])
			if err != nil {
				fmt.Println(err.Error())
				close(in)
				close(out)
				return
			}
			pending = append(pending, chunk[:n]...)
			if strings.Contains(string(chunk[:n]), "More") {
				w.Write([]byte("\n"))
			}

			// Data scanned on earlier iterations contained no marker (it
			// would have been flushed), so a marker can only be in the new
			// bytes plus a short overlap; rescanning everything is needless.
			start := len(pending) - n - overlap
			if start < 0 {
				start = 0
			}
			tail := string(pending[start:])
			if strings.Contains(tail, "username:") ||
				strings.Contains(tail, "password:") ||
				strings.Contains(tail, ">") {
				out <- string(pending)
				pending = pending[:0]
				wg.Done()
			}
		}
	}()
	return in, out
}
The following code works for me.
// main opens an interactive remote shell over SSH using public-key
// authentication and wires the session to this process's stdin, stdout and
// stderr. Any failure panics.
func main() {
	key, err := ioutil.ReadFile("path to your key file")
	if err != nil {
		panic(err)
	}
	signer, err := ssh.ParsePrivateKey(key)
	if err != nil {
		panic(err)
	}
	config := &ssh.ClientConfig{
		User: "ubuntu",
		Auth: []ssh.AuthMethod{
			ssh.PublicKeys(signer),
		},
		// ClientConfig requires a host key callback; without one the
		// connection attempt is rejected. Non-production only: this accepts
		// any host key and therefore offers no protection against
		// man-in-the-middle attacks.
		HostKeyCallback: ssh.InsecureIgnoreHostKey(),
	}
	client, err := ssh.Dial("tcp", "52.91.35.179:22", config)
	if err != nil {
		panic(err)
	}
	defer client.Close()

	session, err := client.NewSession()
	if err != nil {
		panic(err)
	}
	defer session.Close()

	session.Stdout = os.Stdout
	session.Stderr = os.Stderr
	session.Stdin = os.Stdin

	if err := session.Shell(); err != nil {
		panic(err)
	}
	if err := session.Wait(); err != nil {
		panic(err)
	}
}