[Rails] has_oneで最新のレコードを取得できるようにしてeager_loadしたい
1:N
な関係でN個の中で最新のレコードだけ欲しい、っていうケースで
has_manyなrelationのfirstを取得していくとN+1になりまくってつらいので
has_oneで取得できるようにしてeager_loadする方法を調査したメモ
環境
- OS: Ubuntu 20.04
- Ruby: 3.0.2
- Rails: 6.1.4.1
- RDB: Postgres 13
薄いプロジェクトを作る
rails new hasoneload --database=postgresql -B -M -P -C -S -J --skip-yarn
環境整備
Dockerfile
FROM ruby:3.0.2-alpine3.12 as ruby
## Development
FROM ruby AS dev
ENV BUNDLE_FORCE_RUBY_PLATFORM=1 \
CFLAGS="-Wno-cast-function-type"
RUN apk update \
&& apk add --no-cache \
gcc \
g++ \
libc-dev \
linux-headers \
make \
postgresql \
tzdata \
git \
&& apk add --virtual build-dependencies --no-cache \
postgresql-dev \
libxml2-dev \
build-base \
curl-dev \
# https://dustri.org/b/error-loading-shared-library-ld-linux-x86-64so2-on-alpine-linux.html
&& ln -s /lib/libc.musl-x86_64.so.1 /lib/ld-linux-x86-64.so.2
RUN mkdir -p /app
ENV HOME /app
WORKDIR $HOME
COPY ./Gemfile* ./
ARG bundle_install_options="--without development test doc --jobs 4"
RUN bundle config github.https true \
&& bundle install $bundle_install_options \
&& apk del build-dependencies
COPY . /app
CMD ["bundle", "exec", "puma", "-C", "config/puma.rb"]
docker-compose.yml
version: "3.7"
x-backend-envs: &backend-envs
TZ: Asia/Tokyo
RAILS_LOG_TO_STDOUT: 1
x-backend-build: &backend-build
context: .
args:
bundle_install_options: --jobs 10
target: dev
cache_from:
- hasoneload_backend
services:
postgres:
image: postgres:13.2-alpine
container_name: postgres
environment:
- POSTGRES_USER=postgres
- POSTGRES_PASSWORD=postgres
- POSTGRES_HOST=postgres
- POSTGRES_DB=dev
- POSTGRES_DB_TEST=test
ports:
- "5433:5432"
volumes:
- ./postgres/docker-entrypoint-initdb.d:/docker-entrypoint-initdb.d
- pgdata:/var/lib/postgresql/data
backend:
container_name: backend
build: *backend-build
volumes:
- ./:/app:cached
environment: *backend-envs
ports:
- 3001:3000
tty: true
command: /bin/sh -l -c 'bundle && bundle exec puma -C config/puma.rb'
depends_on:
- postgres
volumes:
pgdata:
driver: local
config/database.yml
default: &default
adapter: postgresql
encoding: unicode
pool: <%= ENV.fetch("RAILS_MAX_THREADS") { 5 } %>
host: postgres
port: 5432
username: postgres
password: postgres
development:
<<: *default
database: dev
test:
<<: *default
database: test
production:
<<: *default
database: prod
username: postgres
password: <%= ENV['HASONELOAD_DATABASE_PASSWORD'] %>
起動確認
❯ docker-compose run --rm backend sh
WARNING: Native build is an experimental feature and could change at any time
Starting postgres ... done
Creating hasoneload_backend_run ... done
/app #
とりあえずOK
testをRSpecにしておく
テスト書きたいのでRSpecに変えておきます
newする時にtestをskipしておけばよかった
...
group :development, :test do
gem 'rspec-rails'
gem 'factory_bot_rails'
end
...
/app # bundle
...
/app # bin/rails g rspec:install
Running via Spring preloader in process 77
create .rspec
create spec
create spec/spec_helper.rb
create spec/rails_helper.rb
テーブル・モデルの準備
今回は User
と Post
を 1:N で紐付ける状態にします。
class CreateUsers < ActiveRecord::Migration[6.1]
def change
create_table :users do |t|
t.string :name, null: false
t.timestamps
end
end
end
class CreatePosts < ActiveRecord::Migration[6.1]
def change
create_table :posts do |t|
t.references :user
t.string :title, null: false
t.datetime :posted_at, null: false
t.timestamps
end
end
end
こんな感じにしておきます。
posted_at
が最新のレコードを取得したい
テストを書いてみる
require 'rails_helper'
RSpec.describe User, type: :model do
describe "#latest_post" do
let!(:user) { create(:user) }
let!(:post_1) { create(:post, user: user, posted_at: Time.zone.now) }
let!(:post_2) { create(:post, user: user, posted_at: Time.zone.now + 1.day) }
let!(:post_3) { create(:post, user: user, posted_at: Time.zone.now - 1.day) }
subject { user.latest_post }
it { is_expected.to eq post_2 }
end
end
やりたいこととしてはこんな感じ
メソッドで書いてみる
class User < ApplicationRecord
has_many :posts
def latest_post
posts.order(posted_at: :desc).first
end
end
これは当然テストは通るけど、複数のユーザーに対してすべてlatest_postが欲しいとなると 毎回DBに問い合わせる事になる
has_oneで書いてみる order編
こちらを参考にさせていただきました
class User < ApplicationRecord
has_many :posts
has_one :latest_post, -> { order(posted_at: :desc) }, class_name: :Post
end
これもテストは通ります
複数ユーザーがいる場合のテストケースを追加してみます
require 'rails_helper'
RSpec.describe User, type: :model do
describe "#latest_post" do
let!(:user) { create(:user) }
let!(:post_1) { create(:post, user: user, posted_at: Time.zone.now) }
let!(:post_2) { create(:post, user: user, posted_at: Time.zone.now + 1.day) }
let!(:post_3) { create(:post, user: user, posted_at: Time.zone.now - 1.day) }
context "when single user" do
subject { user.latest_post }
it { is_expected.to eq post_2 }
end
context "when exists other user" do
let!(:other_user) { create(:user) }
let!(:other_posts) { create_list(:post, 10, user: other_user) }
let!(:other_post_1) { create(:post, user: other_user, posted_at: Time.zone.now + 1.day) }
let!(:other_post_2) { create(:post, user: other_user, posted_at: Time.zone.now - 1.day) }
let(:users) { User.all.order(:id).includes(:latest_post) }
subject { users.map(&:latest_post) }
it { is_expected.to eq [post_2, other_post_1] }
end
end
end
これも通りました
よさそう?
SQLを見てみる
rails consoleから実際に発行されているSQLを見てみます
先にUserを2人作成しておきます
irb(main):002:0> 2.times{ |i| User.create(name: "#{i+1}")}
TRANSACTION (0.1ms) BEGIN
User Create (0.5ms) INSERT INTO "users" ("name", "created_at", "updated_at") VALUES ($1, $2, $3) RETURNING "id" [["name", "1"], ["created_at", "2021-08-31 13:56:58.718633"], ["updated_at", "2021-08-31 13:56:58.718633"]]
TRANSACTION (2.3ms) COMMIT
TRANSACTION (0.1ms) BEGIN
User Create (0.2ms) INSERT INTO "users" ("name", "created_at", "updated_at") VALUES ($1, $2, $3) RETURNING "id" [["name", "2"], ["created_at", "2021-08-31 13:56:58.724841"], ["updated_at", "2021-08-31 13:56:58.724841"]]
TRANSACTION (0.5ms) COMMIT
=> 2
irb(main):003:0> User.all
User Load (0.6ms) SELECT "users".* FROM "users"
=> [#<User:0x000055aa3f9add98 id: 1, name: "1", created_at: Tue, 31 Aug 2021 13:56:58.718633000 UTC +00:00, updated_at: Tue, 31 Aug 2021 13:56:58.718633000 UTC +00:00>, #<User:0x000055aa3f9adb40 id: 2, name: "2", created_at: Tue, 31 Aug 2021 13:56:58.724841000 UTC +00:00, updated_at: Tue, 31 Aug 2021 13:56:58.724841000 UTC +00:00>]
includesした時のクエリを見てみます
irb(main):004:0> User.all.includes(:latest_post)
User Load (0.5ms) SELECT "users".* FROM "users"
Post Load (0.7ms) SELECT "posts".* FROM "posts" WHERE "posts"."user_id" IN ($1, $2) ORDER BY "posts"."posted_at" DESC [["user_id", 1], ["user_id", 2]]
SELECT "posts".* FROM "posts" WHERE "posts"."user_id" IN ($1, $2) ORDER BY "posts"."posted_at" DESC [["user_id", 1], ["user_id", 2]]
これは…
該当ユーザーのpostsを全部取ってきて最初の1個を使う、みたいな感じになりそう?
ユーザーのpostsの数が多い時にちょっと問題になりそうな予感がします
postsを各ユーザーに対して100件作ってクエリ投げてみました
postgres@0:dev> SELECT "posts".* FROM "posts" WHERE "posts"."user_id" IN (1, 2) ORDER BY "posts"."posted_at" DESC;
+------+-----------+-------------+----------------------------+----------------------------+----------------------------+
| id | user_id | title | posted_at | created_at | updated_at |
|------+-----------+-------------+----------------------------+----------------------------+----------------------------|
| 200 | 2 | 2 title 100 | 2021-12-08 14:05:45.185371 | 2021-08-31 14:05:45.185689 | 2021-08-31 14:05:45.185689 |
| 100 | 1 | 1 title 100 | 2021-12-08 14:05:45.004113 | 2021-08-31 14:05:45.004432 | 2021-08-31 14:05:45.004432 |
| 199 | 2 | 2 title 99 | 2021-12-07 14:05:45.183623 | 2021-08-31 14:05:45.183981 | 2021-08-31 14:05:45.183981 |
| 99 | 1 | 1 title 99 | 2021-12-07 14:05:45.002313 | 2021-08-31 14:05:45.002635 | 2021-08-31 14:05:45.002635 |
| 198 | 2 | 2 title 98 | 2021-12-06 14:05:45.181685 | 2021-08-31 14:05:45.182037 | 2021-08-31 14:05:45.182037 |
| 98 | 1 | 1 title 98 | 2021-12-06 14:05:45.000227 | 2021-08-31 14:05:45.000641 | 2021-08-31 14:05:45.000641 |
| 197 | 2 | 2 title 97 | 2021-12-05 14:05:45.179828 | 2021-08-31 14:05:45.18014 | 2021-08-31 14:05:45.18014 |
| 97 | 1 | 1 title 97 | 2021-12-05 14:05:44.998281 | 2021-08-31 14:05:44.998604 | 2021-08-31 14:05:44.998604 |
| 196 | 2 | 2 title 96 | 2021-12-04 14:05:45.178037 | 2021-08-31 14:05:45.178365 | 2021-08-31 14:05:45.178365 |
| 96 | 1 | 1 title 96 | 2021-12-04 14:05:44.996494 | 2021-08-31 14:05:44.996805 | 2021-08-31 14:05:44.996805 |
| 195 | 2 | 2 title 95 | 2021-12-03 14:05:45.176172 | 2021-08-31 14:05:45.176527 | 2021-08-31 14:05:45.176527 |
| 95 | 1 | 1 title 95 | 2021-12-03 14:05:44.994745 | 2021-08-31 14:05:44.995057 | 2021-08-31 14:05:44.995057 |
| 194 | 2 | 2 title 94 | 2021-12-02 14:05:45.174367 | 2021-08-31 14:05:45.174686 | 2021-08-31 14:05:45.174686 |
| 94 | 1 | 1 title 94 | 2021-12-02 14:05:44.992985 | 2021-08-31 14:05:44.993298 | 2021-08-31 14:05:44.993298 |
| 193 | 2 | 2 title 93 | 2021-12-01 14:05:45.172576 | 2021-08-31 14:05:45.172896 | 2021-08-31 14:05:45.172896 |
| 93 | 1 | 1 title 93 | 2021-12-01 14:05:44.991167 | 2021-08-31 14:05:44.99148 | 2021-08-31 14:05:44.99148 |
| 192 | 2 | 2 title 92 | 2021-11-30 14:05:45.170765 | 2021-08-31 14:05:45.171091 | 2021-08-31 14:05:45.171091 |
| 92 | 1 | 1 title 92 | 2021-11-30 14:05:44.989329 | 2021-08-31 14:05:44.989643 | 2021-08-31 14:05:44.989643 |
| 191 | 2 | 2 title 91 | 2021-11-29 14:05:45.168932 | 2021-08-31 14:05:45.169249 | 2021-08-31 14:05:45.169249 |
| 91 | 1 | 1 title 91 | 2021-11-29 14:05:44.987351 | 2021-08-31 14:05:44.987751 | 2021-08-31 14:05:44.987751 |
| 190 | 2 | 2 title 90 | 2021-11-28 14:05:45.167161 | 2021-08-31 14:05:45.16748 | 2021-08-31 14:05:45.16748 |
| 90 | 1 | 1 title 90 | 2021-11-28 14:05:44.98522 | 2021-08-31 14:05:44.985693 | 2021-08-31 14:05:44.985693 |
| 189 | 2 | 2 title 89 | 2021-11-27 14:05:45.165312 | 2021-08-31 14:05:45.16563 | 2021-08-31 14:05:45.16563 |
| 89 | 1 | 1 title 89 | 2021-11-27 14:05:44.983192 | 2021-08-31 14:05:44.983591 | 2021-08-31 14:05:44.983591 |
| 188 | 2 | 2 title 88 | 2021-11-26 14:05:45.163544 | 2021-08-31 14:05:45.163864 | 2021-08-31 14:05:45.163864 |
| 88 | 1 | 1 title 88 | 2021-11-26 14:05:44.981116 | 2021-08-31 14:05:44.981522 | 2021-08-31 14:05:44.981522 |
...
やっぱりいっぱい返ってきますね。当然ですが。
202人のユーザーに100個ずつpostsを作って実行してみました
irb(main):005:0> User.all.includes(:latest_post).to_a; nil
User Load (0.7ms) SELECT "users".* FROM "users"
Post Load (25.4ms) SELECT "posts".* FROM "posts" WHERE "posts"."user_id" IN ($1, $2, ...) ORDER BY "posts"."posted_at" DESC [["user_id", 1], ["user_id", 2], ...]
=> nil
postsは 25.4ms
has_oneで書いてみる EXISTS編
not exits
で最新のデータだけ取れればもっと少ないレコードのみを取得できる気がするのでやってみます
こちらを参考にさせていただきました
class User < ApplicationRecord
has_many :posts
has_one :latest_post, -> {
where(
<<~SQL
NOT EXISTS (
SELECT 1 FROM posts AS p
WHERE
posts.posted_at < p.posted_at AND
posts.user_id = p.user_id
)
SQL
)
}, class_name: :Post
end
これもテストは通りました
SQLを見てみる
irb(main):004:0> User.all.includes(:latest_post).to_a; nil
User Load (0.9ms) SELECT "users".* FROM "users"
Post Load (15.7ms) SELECT "posts".* FROM "posts" WHERE (NOT EXISTS (
SELECT 1 FROM posts AS p
WHERE
posts.posted_at < p.posted_at AND
posts.user_id = p.user_id
)
) AND "posts"."user_id" IN ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $50, $51, $52, $53, $54, $55, $56, $57, $58, $59, $60, $61, $62, $63, $64, $65, $66, $67, $68, $...
postsは 15.7ms
こっちの方が速そうです
postgres@0:dev> SELECT "posts".* FROM "posts" WHERE (NOT EXISTS (SELECT 1 FROM posts AS p WHERE posts.posted_at < p.posted_at AND posts.user_id = p.user_id)) AND "posts"."user_id" IN (1, 2);
+------+-----------+-------------+----------------------------+----------------------------+----------------------------+
| id | user_id | title | posted_at | created_at | updated_at |
|------+-----------+-------------+----------------------------+----------------------------+----------------------------|
| 100 | 1 | 1 title 100 | 2021-12-08 14:12:33.064071 | 2021-08-31 14:12:33.064423 | 2021-08-31 14:12:33.064423 |
| 200 | 2 | 2 title 100 | 2021-12-08 14:12:33.244644 | 2021-08-31 14:12:33.245029 | 2021-08-31 14:12:33.245029 |
+------+-----------+-------------+----------------------------+----------------------------+----------------------------+
SELECT 2
Time: 0.019s
最新のデータしか取ってこないので、こっちの方が良さそう?
ただ、生SQLを書くのもちょっと嫌といえば嫌ですね。
has_oneで書いてみる max編
こちらを参考にさせていただきました
class User < ApplicationRecord
has_many :posts
has_one :latest_post, -> {
where(id: Post.group(:user_id).select("MAX(id)"))
}, class_name: :Post
end
ただしこの方法だと最新のレコードのIDがMAXになるひつようがある
大抵の場合はこの条件を満たすと思いますが
今回のテストケースではあえて意地悪な状況を作る為にposted_atが最新のレコードのIDがmaxではない状況にしているので
テストは通りません
SQLを見てみる
irb(main):001:0> User.all.includes(:latest_post).to_a; nil
User Load (0.7ms) SELECT "users".* FROM "users"
Post Load (9.1ms) SELECT "posts".* FROM "posts" WHERE "posts"."id" IN (SELECT MAX(id) FROM "posts" GROUP BY "posts"."user_id") AND "posts"."user_id" IN ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, $33, $34, $35, $36, $37, $38, $39, $40, $41, $42, $43, ...
postsは 9.1ms
これが最速っぽいですね
postgres@0:dev> SELECT "posts".* FROM "posts" WHERE "posts"."id" IN (SELECT MAX(id) FROM "posts" GROUP BY "posts"."user_id") AND "posts"."user_id" IN (1, 2);
+------+-----------+-------------+----------------------------+----------------------------+----------------------------+
| id | user_id | title | posted_at | created_at | updated_at |
|------+-----------+-------------+----------------------------+----------------------------+----------------------------|
| 100 | 1 | 1 title 100 | 2021-12-08 14:12:33.064071 | 2021-08-31 14:12:33.064423 | 2021-08-31 14:12:33.064423 |
| 200 | 2 | 2 title 100 | 2021-12-08 14:12:33.244644 | 2021-08-31 14:12:33.245029 | 2021-08-31 14:12:33.245029 |
+------+-----------+-------------+----------------------------+----------------------------+----------------------------+
SELECT 2
Time: 0.020s
当然ですが、必要なレコードしか返ってきません。
まとめ
- 今回はindexを全く考慮してないので、indexとデータ量次第で結果は変わりそう
- MAXが使えるのであれば、コードもシンプルになり、パフォーマンスもいいのでMAXを使うのが良さそう
- それ以外の場合はNOT EXISTSを使うのがパフォーマンス&メモリ効率的には良さそう
おまけ ベンチマーク
class User < ApplicationRecord
has_many :posts
has_one :latest_post_order, -> { order(posted_at: :desc) }, class_name: :Post
has_one :latest_post_not_exists, -> {
where(
<<~SQL
NOT EXISTS (
SELECT 1 FROM posts AS p
WHERE
posts.posted_at < p.posted_at AND
posts.user_id = p.user_id
)
SQL
)
}, class_name: :Post
has_one :latest_post_max, -> {
where(id: Post.group(:user_id).select("MAX(id)"))
}, class_name: :Post
end
namespace :bench do
task :latest => :environment do
require "benchmark"
count = 10
Benchmark.bmbm do |x|
x.report("order") { count.times{ User.all.includes(:latest_post_order).to_a } }
x.report("not exists") { count.times{ User.all.includes(:latest_post_not_exists).to_a } }
x.report("max") { count.times{ User.all.includes(:latest_post_max).to_a } }
end
end
end
/app # bin/rails bench:latest
Rehearsal ----------------------------------------------
order 1.742294 0.071785 1.814079 ( 1.979291)
not exists 0.074620 0.000147 0.074767 ( 0.186451)
max 0.072828 0.000000 0.072828 ( 0.151328)
------------------------------------- total: 1.961674sec
user system total real
order 1.581774 0.024608 1.606382 ( 1.759577)
not exists 0.074239 0.000000 0.074239 ( 0.193029)
max 0.070366 0.000000 0.070366 ( 0.154676)
Discussion