-
Notifications
You must be signed in to change notification settings - Fork 0
/
loadTwitterEdges.groovy
49 lines (43 loc) · 1.68 KB
/
loadTwitterEdges.groovy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
// Script to import twitter friends csv into JanusGraph
// Data is from https://www.kaggle.com/hwassner/TwitterFriends
// The dataset is provided under the CC BY-NC-SA 4.0 license https://creativecommons.org/licenses/by-nc-sa/4.0/
// direct download link https://www.kaggle.com/hwassner/TwitterFriends/downloads/data.csv/4, requires login
// Author: Chris Hupman ([email protected]) 07/16/2018
// usage: ./bin/gremlin.sh -e $PWD/loadTwitterEdges.groovy $PWD/data.csv $PWD/janusgraph.properties
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.concurrent.TimeUnit;
// Command-line arguments: the CSV file to read, then the JanusGraph properties file.
FILENAME = args[0];
PROPERTIES = args[1];

// Open or create the graph database described by the properties file.
graph = JanusGraphFactory.open(PROPERTIES)

// Obtain a traversal source and roll back its current transaction
// (presumably so the load starts from a clean transaction).
g = graph.traversal();
g.tx().rollback();

// Commit once at least this many edges have been added since the previous commit.
batchSize = 1000000;
// Value of edgeCount at the time of the most recent commit.
lastBatch = 0;
// Running total of "follows" edges added so far.
edgeCount = 0;

println 'Reading in file ' + FILENAME;

// Each line is split on commas: the first field is a user id, every remaining
// field is the id of a vertex that user gets a "follows" edge to.
// eachLine hands the closure the line text and its line number (count).
new File(FILENAME).eachLine { line, count ->
    // Skip blank lines (e.g. a trailing newline at end of file) instead of
    // failing on a lookup for an empty id. Inside the closure, return acts
    // like "continue" for eachLine.
    if (line.trim().isEmpty()) {
        return
    }

    LinkedList fields = line.split(",")
    // Take the first field as the source user's id; what is left are friend ids.
    userId = fields.removeFirst()

    // Vertices are looked up by their 'id' property and are not created here.
    // next() throws NoSuchElementException if no vertex matches, so the
    // vertices are expected to have been loaded beforehand.
    user = g.V().has('id', userId).next()
    for (id in fields) {
        friend = g.V().has('id', id).next()
        g.V(user).addE("follows").to(friend).next()
        edgeCount++
    }

    // Commit once batchSize (or more) edges have accumulated since the last commit.
    if ((edgeCount - lastBatch) >= batchSize) {
        graph.tx().commit();
        // Remember the edge count at this commit. Only edgeCount belongs here:
        // adding the line number would push the next commit past batchSize.
        lastBatch = edgeCount;
        println "Commit complete. Line count is at " + (count) + " edgeCount is at " + edgeCount;
    }
}

// Commit any remaining entries and close the graph.
graph.tx().commit();
graph.close();