FM
class FactorizationMachine(tf.keras.Model):
    def __init__(self, num_features, embedding_size):
        super(FactorizationMachine, self).__init__()
        self.num_features = num_features
        self.embedding_size = embedding_size
        # Initialize parameters
        self.w0 = tf.Variable(tf.zeros([1]), name='bias')
        self.w = tf.Variable(tf.random.normal([num_features, 1]), name='linear_weights')
        self.V = tf.Variable(tf.random.normal([num_features, embedding_size]), name='interaction_weights')

    def call(self, inputs):
        # inputs is a (feat_index, feat_value) pair:
        #   feat_index: (batch_size, num_features) int feature ids
        #   feat_value: (batch_size, num_features) float feature values
        feat_index, feat_value = inputs
        value = tf.reshape(feat_value, [-1, self.num_features, 1])  # (b, n, 1)
        # Embedding part
        embedding_first = tf.nn.embedding_lookup(self.w, feat_index)  # (b, n, 1)
        embedding = tf.nn.embedding_lookup(self.V, feat_index)        # (b, n, k)
        embedding_value = tf.multiply(embedding, value)               # x_i * v_i
        # Forward pass
        # First-order part: w0 + sum_i w_i * x_i
        first_order = tf.reduce_sum(tf.multiply(embedding_first, value), axis=1)
        linear = tf.add(self.w0, first_order)
        # Second-order part: 0.5 * ((sum_i x_i v_i)^2 - sum_i (x_i v_i)^2)
        sum_square = tf.square(tf.reduce_sum(embedding_value, 1))
        square_sum = tf.reduce_sum(tf.square(embedding_value), 1)
        second_order = 0.5 * tf.reduce_sum(tf.subtract(sum_square, square_sum), 1, keepdims=True)
        # FM prediction
        fm_logit = tf.add(linear, second_order)
        return tf.nn.sigmoid(fm_logit)
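A minimal smoke test (the indices and values below are made up for illustration):

model = FactorizationMachine(num_features=4, embedding_size=8)
feat_index = tf.constant([[0, 1, 2, 3], [0, 1, 2, 3]])
feat_value = tf.constant([[1.0, 0.5, 0.0, 2.0], [0.0, 1.0, 1.0, 1.0]])
pred = model((feat_index, feat_value))  # (2, 1) probabilities in (0, 1)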
MHA
import tensorflow as tf
def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Compute scaled dot-product attention.
    Q: Queries shape == (..., seq_len_q, depth)
    K: Keys    shape == (..., seq_len_k, depth)
    V: Values  shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable to (..., seq_len_q, seq_len_k)
    """
    matmul_qk = tf.matmul(Q, K, transpose_b=True)  # (..., seq_len_q, seq_len_k)
    # Scale by sqrt(d_k)
    dk = tf.cast(tf.shape(K)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    # Apply the mask (masked positions get a large negative logit)
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)
    # Normalize with softmax over the key axis
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)
    output = tf.matmul(attention_weights, V)  # (..., seq_len_q, depth_v)
    return output, attention_weights
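The convention above is that masked positions carry 1.0 in mask (they receive a -1e9 logit). A small padding-mask sketch, with a made-up token batch where 0 is assumed to be the padding id:

seq = tf.constant([[7, 6, 0, 0], [1, 2, 3, 0]])
padding_mask = tf.cast(tf.math.equal(seq, 0), tf.float32)  # 1.0 at padding positions
padding_mask = padding_mask[:, tf.newaxis, tf.newaxis, :]  # broadcastable to (batch, heads, seq_len_q, seq_len_k)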
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads == 0
        self.depth = d_model // self.num_heads
        # Projection layers must be created once here, not per call,
        # so their weights are reused across invocations
        self.wq = tf.keras.layers.Dense(d_model, use_bias=False)
        self.wk = tf.keras.layers.Dense(d_model, use_bias=False)
        self.wv = tf.keras.layers.Dense(d_model, use_bias=False)
        self.dense = tf.keras.layers.Dense(d_model, use_bias=False)

    def split_heads(self, x, batch_size):
        """
        Split the last dimension into (num_heads, depth),
        then transpose to shape (batch_size, num_heads, seq_len, depth).
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)
        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)
        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        return output, attention_weights
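A minimal smoke test, assuming TF2 eager mode (the sizes are made up):

mha = MultiHeadAttention(d_model=64, num_heads=4)
x = tf.random.uniform((2, 10, 64))  # (batch, seq_len, d_model)
out, attn = mha(x, x, x, None)      # self-attention, no mask
# out: (2, 10, 64); attn: (2, 4, 10, 10)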
DIN (simple)
def din(self, queries, keys, layer_size):
    """
    queries shape: (b,1,e) : (batch_size, 1, embedding_size)
    keys    shape: (b,t,e) : (batch_size, feasign_size, embedding_size)
    """
    queries = tf.tile(queries, [1, keys.shape[1], 1])  # (b,1,e) -> (b,t,e)
    din_input = tf.concat([queries, keys, queries - keys, queries * keys], axis=-1)
    # No activation here: the Dice layer below serves as the activation
    din_layer1 = tf.layers.dense(din_input, layer_size[0], activation=None, name='att1')
    dice_layer = self.dice(din_layer1, name='dice_layer')
    # layer_size[1] should be 1 so the attention score broadcasts over the embedding dim
    din_layer2 = tf.layers.dense(dice_layer, layer_size[1], activation=None, name='att2')  # (b,t,1)
    output = din_layer2 * keys  # (b,t,e)
    return output
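A hedged usage sketch (assumes a host class providing dice, such as the DINModel below; the shapes are made up):

# queries: candidate-item embedding; keys: user behavior sequence
# queries = tf.placeholder(tf.float32, [None, 1, 16])
# keys    = tf.placeholder(tf.float32, [None, 20, 16])
# out     = model.din(queries, keys, layer_size=(36, 1))  # (b, 20, 16)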
DIN
class DINModel:
    def __init__(self, layer_size=(32, 128), mom_decay_rate=0.99):
        self.mom_decay_rate = mom_decay_rate
        self.layer_size = layer_size
        self.initializer = tf.contrib.layers.xavier_initializer()

    def dice(self, _x, name=''):
        # Dice activation: a learned, data-dependent interpolation between x and alpha*x
        alpha = tf.get_variable('dice_alpha' + name, _x.shape[-1], initializer=tf.contrib.layers.xavier_initializer(), dtype=tf.float32)
        _x_norm = tf.layers.batch_normalization(inputs=_x, momentum=self.mom_decay_rate, name='dice_norm' + name)
        _x_p = tf.nn.sigmoid(_x_norm)
        return alpha * (1.0 - _x_p) * _x + _x_p * _x

    def layer_normalize(self, is_ln, inputs, name):
        # CustomLayerNormalization is assumed to be defined elsewhere
        if is_ln:
            layer_norm = CustomLayerNormalization()
            return layer_norm(inputs=inputs)
        else:
            return inputs

    def din(self, queries, keys, is_ln):
        """
        queries shape: (b,s,1,e) : (batch_size, slots_size, 1, embedding_size)
        keys    shape: (b,s,t,e) : (batch_size, slots_size, feasign_size, embedding_size)
        """
        queries = self.layer_normalize(is_ln, queries, 'queries_norm')
        keys = self.layer_normalize(is_ln, keys, 'keys_norm')
        queries = tf.tile(queries, [1, 1, keys.shape[2], 1])  # (b,s,1,e) -> (b,s,t,e)
        din_input = tf.concat([queries, keys, queries - keys, queries * keys], axis=-1)
        din_layer1 = tf.layers.dense(din_input, self.layer_size[0], activation=tf.nn.relu, name='layer1', kernel_initializer=self.initializer, bias_initializer=self.initializer)
        din_layer2 = self.dice(
            tf.layers.dense(din_layer1, self.layer_size[1], activation=None, name='layer2', kernel_initializer=self.initializer, bias_initializer=self.initializer)
        )
        din_layer3 = tf.layers.dense(din_layer2, 1, activation=None, name='layer3', kernel_initializer=self.initializer, bias_initializer=self.initializer)
        # din_layer3: (b, s, t, 1) attention score per behavior position;
        # broadcasting it over keys and summing over t gives weighted pooling
        output = tf.reduce_sum(din_layer3 * keys, axis=2)  # (b, s, e)
        return output

    def __call__(self, queries, keys, is_ln=True, name='din'):
        with tf.variable_scope(name):
            out = self.din(queries, keys, is_ln)
            return out
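A hedged TF1 usage sketch (shapes made up; is_ln=False because CustomLayerNormalization is an external helper not shown here):

queries = tf.placeholder(tf.float32, [None, 4, 1, 16])  # (b, s, 1, e)
keys = tf.placeholder(tf.float32, [None, 4, 20, 16])    # (b, s, t, e)
din_model = DINModel(layer_size=(32, 128))
pooled = din_model(queries, keys, is_ln=False)          # (b, 4, 16)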
MMOE
gate_hidden1 = 128
expert_hidden = 512
experts_nums = 4
targets_nums = 3
# input: (b, e)
experts_out = []
for i in range(experts_nums):
    expert_out = tf.layers.dense(input, expert_hidden, activation=tf.nn.relu)
    experts_out.append(expert_out)
all_experts_out = tf.stack(experts_out, axis=1)  # (b, experts_nums, expert_hidden)

def gate_build(input):
    gate_hidden = tf.layers.dense(input, units=gate_hidden1, activation=tf.nn.relu)
    gate_out = tf.layers.dense(gate_hidden, experts_nums, activation=tf.nn.softmax)
    gate_out = tf.expand_dims(gate_out, axis=-1)  # (b, 4, 1)
    return gate_out

outs = []
# all_experts_out: (b, 4, 512)
for i in range(targets_nums):
    gate = gate_build(input)
    outs.append(tf.reduce_sum(gate * all_experts_out, axis=1))
# outs: list of targets_nums tensors, each of shape (b, 512)
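The fragment assumes a dense input tensor defined beforehand, e.g. input = tf.placeholder(tf.float32, [None, 64]) in TF1 (the width 64 is made up). Each gate then produces (b, 4, 1) softmax weights that broadcast against all_experts_out of shape (b, 4, 512); summing over the expert axis leaves one (b, 512) mixture per task, so each of the 3 targets learns its own weighting over the shared experts.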
SENet
# input: 64 * 100 * 8  (batch * slots * embedding)
# squeeze: reduce_mean over the embedding dim -> 64 * 100
# excitation: 2-layer MLP (relu, sigmoid) -> 64 * 100, expanded to 64 * 100 * 1
# reweight: Hadamard product with the input
input = tf.random.uniform(shape=(64, 100, 8))
squ_tensor = tf.reduce_mean(input, axis=-1)  # (b, l, e) -> (b, l)
out1 = tf.layers.dense(squ_tensor, units=512, activation=tf.nn.relu, name='se1')
out2 = 2 * tf.layers.dense(out1, units=squ_tensor.shape[-1], activation=tf.nn.sigmoid, name='se2')  # (b, l)
res = tf.expand_dims(out2, axis=-1) * input  # (b, l, e)
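A note on the gate above: a plain sigmoid (as in the original SE block) bounds the per-slot weights to (0, 1), so the excitation can only suppress slots; multiplying by 2 widens the range to (0, 2), letting it amplify a slot by up to 2x as well.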
Gradients
def get_gradients(qs, input_feas, embedding_size, rv_feas_len):
    grad_list = []
    # Process the gradient information within one batch
    def grad_processing(grad, emb_size, rv_len):
        # Average over the batch, then take the absolute value.
        # The gradient magnitude reflects how sensitive the output is to the
        # corresponding feature; values near 0 mean low sensitivity.
        batch_avg_grad = tf.abs(tf.reduce_mean(grad, axis=0))
        rv_grad = batch_avg_grad[:rv_len]
        slot_embed_grad = tf.reshape(batch_avg_grad[rv_len:], [-1, emb_size])
        # The largest absolute value within a slot represents that slot's gradient
        slot_grad = tf.reduce_max(slot_embed_grad, axis=1)
        return tf.concat([rv_grad, slot_grad], axis=-1)
    for i in range(len(qs)):
        concat_grad = tf.concat(tf.gradients(qs[i], input_feas), axis=-1)
        grad_list.append(grad_processing(concat_grad, embedding_size, rv_feas_len))
    return grad_list
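A hedged TF1 wiring sketch (the layout of input_feas as [raw values | flattened slot embeddings], and the toy objective, are assumptions):

rv_len, emb_size, num_slots = 8, 4, 5
input_feas = tf.placeholder(tf.float32, [None, rv_len + num_slots * emb_size])
q = tf.reduce_sum(tf.layers.dense(input_feas, 1))  # toy scalar objective standing in for a model head
grads = get_gradients([q], input_feas, emb_size, rv_len)  # one (rv_len + num_slots,) tensor per q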
Compute AUC
import numpy as np
from sklearn.metrics import roc_auc_score
# Compute AUC with the sklearn package
def get_auc(y_labels, y_scores):
    auc = roc_auc_score(y_labels, y_scores)
    print('AUC calculated by sklearn tool is {}'.format(auc))
    return auc
# Method 1: count concordant positive-negative pairs
def calculate_auc_func1(labels, probs):
    N = 0  # number of negative samples
    P = 0  # number of positive samples
    neg_prob = []  # predicted scores of negative samples
    pos_prob = []  # predicted scores of positive samples
    for index, label in enumerate(labels):
        if label == 1:
            P += 1
            pos_prob.append(probs[index])
        else:
            N += 1
            neg_prob.append(probs[index])
    number = 0
    # Iterate over every positive-negative pair
    for pos in pos_prob:
        for neg in neg_prob:
            if pos > neg:
                # positive scored above negative: one concordant pair
                number += 1
            elif pos == neg:
                # a tie counts as half a concordant pair
                number += 0.5
    auc = number / (N * P)
    print('AUC calculated by function1 is {}'.format(auc))
    return auc
# Method 2: compute AUC from the rank statistic. When scores tie, this uses
# the raw sort position instead of the average rank required by the definition;
# with large data the effect on AUC is small.
def calculate_auc_func2(y_labels, y_scores):
    samples = list(zip(y_scores, y_labels))
    rank = [(label, score) for score, label in sorted(samples, key=lambda x: x[0])]
    pos_rank = [i + 1 for i in range(len(rank)) if rank[i][0] == 1]
    pos_cnt = np.sum(y_labels == 1)
    neg_cnt = np.sum(y_labels == 0)
    auc = (np.sum(pos_rank) - pos_cnt * (pos_cnt + 1) / 2) / (pos_cnt * neg_cnt)
    print('AUC calculated by function2 is {:.2f}'.format(auc))
    return auc
if __name__ == '__main__':
    y_labels = np.array([1, 1, 0, 0, 0])
    y_scores = np.array([0.4, 0.8, 0.2, 0.4, 0.5])
    get_auc(y_labels, y_scores)
    calculate_auc_func1(y_labels, y_scores)
    calculate_auc_func2(y_labels, y_scores)
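For reference, on the toy arrays above the positives score 0.4 and 0.8 against negatives 0.2, 0.4, 0.5, giving 4.5 concordant pairs out of 6, so the expected output is:

# AUC calculated by sklearn tool is 0.75
# AUC calculated by function1 is 0.75
# AUC calculated by function2 is 0.67  (the tie at 0.4 is not rank-averaged)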