FM
class FactorizationMachine(tf.keras.Model):
    def __init__(self, num_features, embedding_size):
        super(FactorizationMachine, self).__init__()
        self.num_features = num_features
        self.embedding_size = embedding_size
        # Initialize parameters
        self.w0 = tf.Variable(tf.zeros([1]), name='bias')
        self.w = tf.Variable(tf.random.normal([num_features, 1]), name='linear_weights')
        self.V = tf.Variable(tf.random.normal([num_features, embedding_size]), name='interaction_weights')

    def call(self, inputs):
        # inputs is a (feat_index, feat_value) pair:
        #   feat_index: (batch_size, num_features) int feature ids
        #   feat_value: (batch_size, num_features) float feature values
        feat_index, feat_value = inputs
        value = tf.reshape(feat_value, [-1, self.num_features, 1])  # (b, n, 1)
        # Embedding part
        embedding_first = tf.nn.embedding_lookup(self.w, feat_index)  # (b, n, 1)
        embedding = tf.nn.embedding_lookup(self.V, feat_index)        # (b, n, k)
        embedding_value = tf.multiply(embedding, value)               # x_i * v_i
        # Forward pass
        # First-order part: w0 + sum_i w_i * x_i
        first_order = tf.reduce_sum(tf.multiply(embedding_first, value), axis=1)
        linear = tf.add(self.w0, first_order)
        # Second-order part: 0.5 * ((sum_i x_i v_i)^2 - sum_i (x_i v_i)^2)
        sum_square = tf.square(tf.reduce_sum(embedding_value, 1))
        square_sum = tf.reduce_sum(tf.square(embedding_value), 1)
        second_order = 0.5 * tf.reduce_sum(tf.subtract(sum_square, square_sum), 1, keepdims=True)
        # FM prediction
        fm_logit = tf.add(linear, second_order)
        return tf.nn.sigmoid(fm_logit)
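A minimal smoke test (the indices and values below are made up for illustration):

model = FactorizationMachine(num_features=4, embedding_size=8)
feat_index = tf.constant([[0, 1, 2, 3], [0, 1, 2, 3]])
feat_value = tf.constant([[1.0, 0.5, 0.0, 2.0], [0.0, 1.0, 1.0, 1.0]])
pred = model((feat_index, feat_value))  # (2, 1) probabilities in (0, 1)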
MHA
import tensorflow as tf
def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Compute scaled dot-product attention.
    Q: Queries shape == (..., seq_len_q, depth)
    K: Keys    shape == (..., seq_len_k, depth)
    V: Values  shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable to (..., seq_len_q, seq_len_k)
    """
    matmul_qk = tf.matmul(Q, K, transpose_b=True)  # (..., seq_len_q, seq_len_k)
    # Scale by sqrt(d_k)
    dk = tf.cast(tf.shape(K)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    # Apply the mask (masked positions get a large negative logit)
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)
    # Normalize with softmax over the key axis
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)
    output = tf.matmul(attention_weights, V)  # (..., seq_len_q, depth_v)
    return output, attention_weights
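The convention above is that masked positions carry 1.0 in mask (they receive a -1e9 logit). A small padding-mask sketch, with a made-up token batch where 0 is assumed to be the padding id:

seq = tf.constant([[7, 6, 0, 0], [1, 2, 3, 0]])
padding_mask = tf.cast(tf.math.equal(seq, 0), tf.float32)  # 1.0 at padding positions
padding_mask = padding_mask[:, tf.newaxis, tf.newaxis, :]  # broadcastable to (batch, heads, seq_len_q, seq_len_k)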
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads == 0
        self.depth = d_model // self.num_heads
        # Projection layers must be created once here, not per call,
        # so their weights are reused across invocations
        self.wq = tf.keras.layers.Dense(d_model, use_bias=False)
        self.wk = tf.keras.layers.Dense(d_model, use_bias=False)
        self.wv = tf.keras.layers.Dense(d_model, use_bias=False)
        self.dense = tf.keras.layers.Dense(d_model, use_bias=False)

    def split_heads(self, x, batch_size):
        """
        Split the last dimension into (num_heads, depth),
        then transpose to shape (batch_size, num_heads, seq_len, depth).
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)
        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)
        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        return output, attention_weights
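A minimal smoke test, assuming TF2 eager mode (the sizes are made up):

mha = MultiHeadAttention(d_model=64, num_heads=4)
x = tf.random.uniform((2, 10, 64))  # (batch, seq_len, d_model)
out, attn = mha(x, x, x, None)      # self-attention, no mask
# out: (2, 10, 64); attn: (2, 4, 10, 10)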
DIN (simple)
def din(self, queries, keys, layer_size):
    """
    queries shape: (b,1,e) : (batch_size, 1, embedding_size)
    keys    shape: (b,t,e) : (batch_size, feasign_size, embedding_size)
    """
    queries = tf.tile(queries, [1, keys.shape[1], 1])  # (b,1,e) -> (b,t,e)
    din_input = tf.concat([queries, keys, queries - keys, queries * keys], axis=-1)
    # No activation here: the Dice layer below serves as the activation
    din_layer1 = tf.layers.dense(din_input, layer_size[0], activation=None, name='att1')
    dice_layer = self.dice(din_layer1, name='dice_layer')
    # layer_size[1] should be 1 so the attention score broadcasts over the embedding dim
    din_layer2 = tf.layers.dense(dice_layer, layer_size[1], activation=None, name='att2')  # (b,t,1)
    output = din_layer2 * keys  # (b,t,e)
    return output
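A hedged usage sketch (assumes a host class providing dice, such as the DINModel below; the shapes are made up):

# queries: candidate-item embedding; keys: user behavior sequence
# queries = tf.placeholder(tf.float32, [None, 1, 16])
# keys    = tf.placeholder(tf.float32, [None, 20, 16])
# out     = model.din(queries, keys, layer_size=(36, 1))  # (b, 20, 16)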
DIN
class DINModel:
    def __init__(self, layer_size=(32, 128), mom_decay_rate=0.99):
        self.mom_decay_rate = mom_decay_rate
        self.layer_size = layer_size
        self.initializer = tf.contrib.layers.xavier_initializer()

    def dice(self, _x, name=''):
        # Dice activation: a learned, data-dependent interpolation between x and alpha*x
        alpha = tf.get_variable('dice_alpha' + name, _x.shape[-1], initializer=tf.contrib.layers.xavier_initializer(), dtype=tf.float32)
        _x_norm = tf.layers.batch_normalization(inputs=_x, momentum=self.mom_decay_rate, name='dice_norm' + name)
        _x_p = tf.nn.sigmoid(_x_norm)
        return alpha * (1.0 - _x_p) * _x + _x_p * _x

    def layer_normalize(self, is_ln, inputs, name):
        # CustomLayerNormalization is assumed to be defined elsewhere
        if is_ln:
            layer_norm = CustomLayerNormalization()
            return layer_norm(inputs=inputs)
        else:
            return inputs

    def din(self, queries, keys, is_ln):
        """
        queries shape: (b,s,1,e) : (batch_size, slots_size, 1, embedding_size)
        keys    shape: (b,s,t,e) : (batch_size, slots_size, feasign_size, embedding_size)
        """
        queries = self.layer_normalize(is_ln, queries, 'queries_norm')
        keys = self.layer_normalize(is_ln, keys, 'keys_norm')
        queries = tf.tile(queries, [1, 1, keys.shape[2], 1])  # (b,s,1,e) -> (b,s,t,e)
        din_input = tf.concat([queries, keys, queries - keys, queries * keys], axis=-1)
        din_layer1 = tf.layers.dense(din_input, self.layer_size[0], activation=tf.nn.relu, name='layer1', kernel_initializer=self.initializer, bias_initializer=self.initializer)
        din_layer2 = self.dice(
            tf.layers.dense(din_layer1, self.layer_size[1], activation=None, name='layer2', kernel_initializer=self.initializer, bias_initializer=self.initializer)
        )
        din_layer3 = tf.layers.dense(din_layer2, 1, activation=None, name='layer3', kernel_initializer=self.initializer, bias_initializer=self.initializer)
        # din_layer3: (b, s, t, 1) attention score per behavior position;
        # broadcasting it over keys and summing over t gives weighted pooling
        output = tf.reduce_sum(din_layer3 * keys, axis=2)  # (b, s, e)
        return output

    def __call__(self, queries, keys, is_ln=True, name='din'):
        with tf.variable_scope(name):
            out = self.din(queries, keys, is_ln)
            return out
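A hedged TF1 usage sketch (shapes made up; is_ln=False because CustomLayerNormalization is an external helper not shown here):

queries = tf.placeholder(tf.float32, [None, 4, 1, 16])  # (b, s, 1, e)
keys = tf.placeholder(tf.float32, [None, 4, 20, 16])    # (b, s, t, e)
din_model = DINModel(layer_size=(32, 128))
pooled = din_model(queries, keys, is_ln=False)          # (b, 4, 16)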
MMOE
gate_hidden1 = 128
expert_hidden = 512
experts_nums = 4
targets_nums = 3
# input: (b, e)
experts_out = []
for i in range(experts_nums):
    expert_out = tf.layers.dense(input, expert_hidden, activation=tf.nn.relu)
    experts_out.append(expert_out)
all_experts_out = tf.stack(experts_out, axis=1)  # (b, experts_nums, expert_hidden)

def gate_build(input):
    gate_hidden = tf.layers.dense(input, units=gate_hidden1, activation=tf.nn.relu)
    gate_out = tf.layers.dense(gate_hidden, experts_nums, activation=tf.nn.softmax)
    gate_out = tf.expand_dims(gate_out, axis=-1)  # (b, 4, 1)
    return gate_out

outs = []
# all_experts_out: (b, 4, 512)
for i in range(targets_nums):
    gate = gate_build(input)
    outs.append(tf.reduce_sum(gate * all_experts_out, axis=1))
# outs: list of targets_nums tensors, each of shape (b, 512)
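The fragment assumes a dense input tensor defined beforehand, e.g. input = tf.placeholder(tf.float32, [None, 64]) in TF1 (the width 64 is made up). Each gate then produces (b, 4, 1) softmax weights that broadcast against all_experts_out of shape (b, 4, 512); summing over the expert axis leaves one (b, 512) mixture per task, so each of the 3 targets learns its own weighting over the shared experts.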
SENet
# input: 64 * 100 * 8  (batch * slots * embedding)
# squeeze: reduce_mean over the embedding dim -> 64 * 100
# excitation: 2-layer MLP (relu, sigmoid) -> 64 * 100, expanded to 64 * 100 * 1
# reweight: Hadamard product with the input
input = tf.random.uniform(shape=(64, 100, 8))
squ_tensor = tf.reduce_mean(input, axis=-1)  # (b, l, e) -> (b, l)
out1 = tf.layers.dense(squ_tensor, units=512, activation=tf.nn.relu, name='se1')
out2 = 2 * tf.layers.dense(out1, units=squ_tensor.shape[-1], activation=tf.nn.sigmoid, name='se2')  # (b, l)
res = tf.expand_dims(out2, axis=-1) * input  # (b, l, e)
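A note on the gate above: a plain sigmoid (as in the original SE block) bounds the per-slot weights to (0, 1), so the excitation can only suppress slots; multiplying by 2 widens the range to (0, 2), letting it amplify a slot by up to 2x as well.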
Gradients
def get_gradients(qs, input_feas, embedding_size, rv_feas_len):
    grad_list = []
    # Process the gradient information within one batch
    def grad_processing(grad, emb_size, rv_len):
        # Average over the batch, then take the absolute value.
        # The gradient magnitude reflects how sensitive the output is to the
        # corresponding feature; values near 0 mean low sensitivity.
        batch_avg_grad = tf.abs(tf.reduce_mean(grad, axis=0))
        rv_grad = batch_avg_grad[:rv_len]
        slot_embed_grad = tf.reshape(batch_avg_grad[rv_len:], [-1, emb_size])
        # The largest absolute value within a slot represents that slot's gradient
        slot_grad = tf.reduce_max(slot_embed_grad, axis=1)
        return tf.concat([rv_grad, slot_grad], axis=-1)
    for i in range(len(qs)):
        concat_grad = tf.concat(tf.gradients(qs[i], input_feas), axis=-1)
        grad_list.append(grad_processing(concat_grad, embedding_size, rv_feas_len))
    return grad_list
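A hedged TF1 wiring sketch (the layout of input_feas as [raw values | flattened slot embeddings], and the toy objective, are assumptions):

rv_len, emb_size, num_slots = 8, 4, 5
input_feas = tf.placeholder(tf.float32, [None, rv_len + num_slots * emb_size])
q = tf.reduce_sum(tf.layers.dense(input_feas, 1))  # toy scalar objective standing in for a model head
grads = get_gradients([q], input_feas, emb_size, rv_len)  # one (rv_len + num_slots,) tensor per q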
Compute AUC
import numpy as np
from sklearn.metrics import roc_auc_score
# Compute AUC with the sklearn package
def get_auc(y_labels, y_scores):
    auc = roc_auc_score(y_labels, y_scores)
    print('AUC calculated by sklearn tool is {}'.format(auc))
    return auc
# Method 1: count concordant positive-negative pairs
def calculate_auc_func1(labels, probs):
    N = 0  # number of negative samples
    P = 0  # number of positive samples
    neg_prob = []  # predicted scores of negative samples
    pos_prob = []  # predicted scores of positive samples
    for index, label in enumerate(labels):
        if label == 1:
            P += 1
            pos_prob.append(probs[index])
        else:
            N += 1
            neg_prob.append(probs[index])
    number = 0
    # Iterate over every positive-negative pair
    for pos in pos_prob:
        for neg in neg_prob:
            if pos > neg:
                # positive scored above negative: one concordant pair
                number += 1
            elif pos == neg:
                # a tie counts as half a concordant pair
                number += 0.5
    auc = number / (N * P)
    print('AUC calculated by function1 is {}'.format(auc))
    return auc
# Method 2: compute AUC from the rank statistic. When scores tie, this uses
# the raw sort position instead of the average rank required by the definition;
# with large data the effect on AUC is small.
def calculate_auc_func2(y_labels, y_scores):
    samples = list(zip(y_scores, y_labels))
    rank = [(label, score) for score, label in sorted(samples, key=lambda x: x[0])]
    pos_rank = [i + 1 for i in range(len(rank)) if rank[i][0] == 1]
    pos_cnt = np.sum(y_labels == 1)
    neg_cnt = np.sum(y_labels == 0)
    auc = (np.sum(pos_rank) - pos_cnt * (pos_cnt + 1) / 2) / (pos_cnt * neg_cnt)
    print('AUC calculated by function2 is {:.2f}'.format(auc))
    return auc
if __name__ == '__main__':
    y_labels = np.array([1, 1, 0, 0, 0])
    y_scores = np.array([0.4, 0.8, 0.2, 0.4, 0.5])
    get_auc(y_labels, y_scores)
    calculate_auc_func1(y_labels, y_scores)
    calculate_auc_func2(y_labels, y_scores)
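For reference, on the toy arrays above the positives score 0.4 and 0.8 against negatives 0.2, 0.4, 0.5, giving 4.5 concordant pairs out of 6, so the expected output is:

# AUC calculated by sklearn tool is 0.75
# AUC calculated by function1 is 0.75
# AUC calculated by function2 is 0.67  (the tie at 0.4 is not rank-averaged)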