I've had a student ask about modelling relationships between collections in mongoose/MongoDB. In relational databases, this process is well-defined: you have to use foreign keys. In MongoDB however, you're given the option to embed one collection in another, and go as deep as you like.
Alternatively, you could use foreign keys, just like with a relational database. Here, I'm going to compare the two approaches.
Sub-Documents
In the diagram above, posts are sub-documents of user documents. Loading a single user's posts is easy, but what if we wanted to get all posts? In MongoDB this is achieved through aggregation.
Aggregation
The MongoDB documentation makes aggregation look tricky but it's actually quite intuitive. To experiment with this, set up a project with the following file:
// index.js
var mongoose = require('mongoose'),
Schema = mongoose.Schema,
uuid = require('uuid');
var postSchema = new Schema({
text: {
type: String,
required: true
}
});
var userSchema = new Schema({
username: {
type: String,
required: true,
unique: true
},
posts: [
postSchema
]
});
var User = mongoose.model('User', userSchema);
var users = [1,2,3,4,5].map(function(_) {
return {
username: uuid.v4(),
posts: [1,2,3,4].map(function(_) {
return {
text: uuid.v4()
}
})
}
});
function generateData(callback) {
User.create(users, function(err) {
if (err) { return console.log(err) }
callback(err);
});
}
mongoose.connect('mongodb://localhost:testmongo/testmongo', function(err) {
// Reset database
var db = mongoose.connection;
db.collection('users').drop();
generateData(function(err) {
if (err) { return console.log(err) }
// -------------------------------------------
// Work here
// -------------------------------------------
// Exit application
// mongoose.disconnect();
});
});
This simply sets up a database with a user schema with some embedded posts. I'm going to provide some snippets now that you should paste in where our script says "Work here".
To begin with, let's execute the simplest aggregate
query possible, with a callback:
User.aggregate([
{$match: {}}
], function(err, users) {
console.log(users);
});
You'll see that we passed an array to User.aggregate
; this array is actually a pipeline of operations to be performed subsequently. The only operation we've performed on our User model is $match
with a blank query object, so all user documents are returned.
We wish to map each of these user objects to their respective posts however, and aggregate these. The operator to achieve this is $unwind
, which switches out each user document with its respective posts here:
User.aggregate([
{$match: {}},
{$unwind: '$posts'}
], function(err, posts) {
console.log(posts);
});
Impressed? $match
and $unwind
are Aggregation Pipeline Operators; there are more, and they can be endlessly stacked on top of eachother. For example, I'll now filter the posts we're aggregating with another $match
:
User.aggregate([
{$match: {}},
{$unwind: '$posts'},
{$match: { 'posts.text': { $regex: /^[0-9].*$/ } }}
], function(err, posts) {
console.log(posts);
});
This returns all posts which have a text
property that begins with a number. Note that in my second match statement I have to specify the field as posts.text
as opposed to just text
: the document-of-reference seems to still be User
at that point.
In fact, that first $match
operator is totally unnecessary:
User.aggregate([
{$unwind: '$posts'},
{$match: { 'posts.text': { $regex: /^[0-9].*$/ } }}
], function(err, posts) {
console.log(posts);
});
Performance Considerations
One major downside to this method is that User documents can grow to an undefined size, and large arrays in MongoDB documents can result in a high CPU-overhead. If users can end up with very large numbers of posts, we're better off keeping our user documents small, and moving posts out into another collection which stores references to users with foreign keys.
Foreign Keys
If we're going to be using foreign keys, we better learn how to do joins as well. MongoDB version 3.2 introduced new aggregation operators including $lookup
, which basically acts as an SQL JOIN
.
I'm going to re-create the same database now but with two seperate collections for Users and Posts. Copy and paste the following script to start with:
// index.js
var mongoose = require('mongoose'),
Schema = mongoose.Schema,
ObjectId = mongoose.Types.ObjectId,
uuid = require('uuid');
var postSchema = new Schema({
text: {
type: String,
required: true
},
user_id: {
type: Schema.Types.ObjectId,
ref: 'User',
required: true
}
});
var userSchema = new Schema({
username: {
type: String,
required: true,
unique: true
}
});
var User = mongoose.model('User', userSchema);
var Post = mongoose.model('Post', postSchema);
var users = [1,2,3,4,5].map(function(_) {
return {
username: uuid.v4(),
_id: new ObjectId()
}
});
var posts = Array.from(new Array(20), function(x,i){i+1}).map(function(_) {
return {
_id: new ObjectId(),
text: uuid.v4(),
user_id: users[Math.floor(Math.random() * 5)]._id
}
});
function generateData(callback) {
User.create(users, function(err) {
if (err) { return console.log(err) }
Post.create(posts, function(err) {
if (err) { return console.log(err) }
callback(err);
});
});
}
mongoose.connect('mongodb://localhost:testmongo/testmongo', function(err) {
// Reset database
var db = mongoose.connection;
db.collection('users').drop();
db.collection('posts').drop();
generateData(function(err) {
if (err) { return console.log(err) }
// -------------------------------------------
// Work here
// -------------------------------------------
// Exit application
// mongoose.disconnect();
});
});
It's important to note above that mongoose.Types.ObjectId
is used to initialise ObjectId's, but in the schema you must use mongoose.Schema.Types.ObjectId
.
Alright, so let's say we wanted to get all of our user documents, but with their relevant post documents embedded. We'll use aggregate
and $lookup
for this:
User.aggregate([
{$lookup: {
from: 'posts',
localField: '_id',
foreignField: 'user_id',
as: 'posts',
}}
], function(err, users) {
console.log(users);
console.log(users[0].posts[0].text);
});
So aggregate
grabs all of our users, then for each user attaches documents from posts
with a user_id
equal to the user's _id
, as posts
. Notice how posts
is lowercase and pluralised even though we registered our model as Post
-- this is a massive gotcha.
What if we wanted all of the posts associated with a particular user? We would use a simple find
operation.
var user_id = users[0]._id;
Post.find({user_id: user_id}, function(err, posts) {
if (err) { return console.log(err) }
console.log(posts);
});
Just in case you haven't seen it before, we can also execute callbacks as follows:
var user_id = users[0]._id;
Post.find({user_id: user_id}).exec(function(err, posts) {
if (err) { return console.log(err) }
console.log(posts);
});
If we wanted a single post along with the user's information, we can attach the associated user's information with another $lookup
:
var post_id = posts[0]._id;
Post.aggregate([
{$match: {_id: post_id}},
{$lookup: {
from: 'users',
localField: 'user_id',
foreignField: '_id',
as: 'author',
}}
], function(err, posts) {
console.log(posts[0]);
});
But there's an opportunity to improve performance here. Ideally, we'd like to just be able to retrieve a single document with all the information we need (without performing a join). We can actually choose to include redundant information in the database to achieve this effect.
A Hybrid Approach
Let's update our postSchema and the way we generate our post test data, so posts store user their associated user's username:
//...
var postSchema = new Schema({
text: {
type: String,
required: true
},
user: {
_id: {
type: Schema.Types.ObjectId,
ref: 'User',
required: true
},
username: {
type: String
}
}
});
//...
var posts = Array.from(new Array(20), function(x,i){i+1}).map(function(_) {
var user = users[Math.floor(Math.random() * 5)];
return {
_id: new ObjectId(),
text: uuid.v4(),
user: {
_id: user._id ,
username: user.username
}
}
});
//...
Now the operation that finds all posts along with their associated user's information is a simple find query:
Post.find({}).exec(function(err, posts) {
if (err) { return console.log(err) }
console.log(posts);
});
...because post documents store the user's details. However, if a user changes their username, we're going to have inconsistencies in our database. To avoid this, we give the user model a pre-save hook, and a post-save hook:
// Add this just before model definitions
userSchema.pre('save', function(next) {
var user = this;
user.needsToUpdatePostsWithNewUsername = user.isModified('username');
next();
});
userSchema.post('save', function(doc, next) {
var user = this;
if (this.needsToUpdatePostsWithNewUsername) {
Post.update({'user._id': user._id}, // Find posts related to this user
{$set: {'user.username': user.username}}, // Update the username
{multi: true}, // Options object (update multiple documents)
function(err) {
if (err) { return next(err) }
next();
});
} else {
next();
}
});
var User = mongoose.model('User', userSchema);
var Post = mongoose.model('Post', postSchema);
The reason we need that pre-save hook is that we actually don't have access to user.isModified
in post-save hooks. Conviently, our user object carries arbitrary properties from the pre-save hook into the post-save hook. In the post-save hook, if the username's been updated, we find all posts associated with this user, and we update them to store the new username.
Now, to make sure this works, we can run the following code which selects the first user, updates their username, then finds posts associated with that username:
User.find({}).exec(function(err, users) {
if (err) { return console.log(err) }
var user = users[0];
user.username = "alex";
user.save(function(err) {
if (err) { return console.log(err) }
Post.find({'user.username': "alex"}).exec(function(err, posts) {
if (err) { return console.log(err) }
console.log(posts);
});
});
});
So that's the ideal way to model this database: with redundancy-by-design.
Following Users
This student would like to know how to model "Users" and "Posts", but where users can follow each other, and where users only see the posts of those who they follow.
I would achieve this effect as follows:
First, change the schema to store users who a user follows:
var userSchema = new Schema({
username: {
type: String,
required: true,
unique: true
},
followingUsers: [
{
user_id: Schema.Types.ObjectId
}
]
});
Correspondingly we'll generate test users as follows:
var users = [1,2,3,4,5].map(function(_) {
return {
username: uuid.v4(),
_id: new ObjectId()
}
});
users = users.map(function(user) {
user.followingUsers = [
{ user_id: users[0]._id }
];
return user;
});
console.log("Everybody follows user with _id:", users[0]._id);
And I'll add a user method to get the posts in a user's feed (these will be posts by users in the given user's followingUsers property):
userSchema.methods.getFeed = function(callback) {
var user = this;
var followingUsersIDs = this.followingUsers.map(function(followingUser) {
return followingUser.user_id;
});
Post.find({'user._id': {$in: followingUsersIDs}}, function(err, posts) {
callback(posts);
});
};
And here's an example of grabbing the first user's feed:
User.find({}).exec(function(err, users) {
if (err) { return console.log(err) }
var user = users[0];
user.getFeed(function(err, posts) {
if (err) { return console.log(err) }
console.log(posts);
});
});
It's that easy.